Index: head/sys/alpha/alpha/pmap.c
===================================================================
--- head/sys/alpha/alpha/pmap.c (revision 132081)
+++ head/sys/alpha/alpha/pmap.c (revision 132082)
@@ -1,2825 +1,2824 @@
/* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 1998 Doug Rabson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Notes for alpha pmap.
* * On alpha, pm_pdeobj will hold lev1, lev2 and lev3 page tables. * Indices from 0 to NUSERLEV3MAPS-1 will map user lev3 page tables, * indices from NUSERLEV3MAPS to NUSERLEV3MAPS+NUSERLEV2MAPS-1 will * map user lev2 page tables and index NUSERLEV3MAPS+NUSERLEV2MAPS * will map the lev1 page table. The lev1 table will self map at * address VADDR(PTLEV1I,0,0). * * The vm_object kptobj holds the kernel page tables on i386 (62 or 63 * of them, depending on whether the system is SMP). On alpha, kptobj * will hold the lev3 and lev2 page tables for K1SEG. Indices 0 to * NKLEV3MAPS-1 will map kernel lev3 page tables and indices * NKLEV3MAPS to NKLEV3MAPS+NKLEV2MAPS will map lev2 page tables. (XXX * should the kernel Lev1map be inserted into this object?). * * pvtmmap is not needed for alpha since K0SEG maps all of physical * memory. * * * alpha virtual memory map: * * * Address Lev1 index * * --------------------------------- * 0000000000000000 | | 0 * | | * | | * | | * | | * --- --- * User space (USEG) * --- --- * | | * | | * | | * | | * 000003ffffffffff | | 511=UMAXLEV1I * --------------------------------- * fffffc0000000000 | | 512=K0SEGLEV1I * | Kernel code/data/bss | * | | * | | * | | * --- --- * K0SEG * --- --- * | | * | 1-1 physical/virtual | * | | * | | * fffffdffffffffff | | * --------------------------------- * fffffe0000000000 | | 768=K1SEGLEV1I * | Kernel dynamic data | * | | * | | * | | * --- --- * K1SEG * --- --- * | | * | mapped by ptes | * | | * | | * fffffff7ffffffff | | * --------------------------------- * fffffffe00000000 | | 1023=PTLEV1I * | PTmap (pte self map) | * ffffffffffffffff | | * --------------------------------- * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if 0 #define PMAP_DIAGNOSTIC #define PMAP_DEBUG #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Some macros for manipulating virtual addresses */ #define ALPHA_L1SIZE (1L << ALPHA_L1SHIFT) #define ALPHA_L2SIZE (1L << ALPHA_L2SHIFT) #define alpha_l1trunc(va) ((va) & ~(ALPHA_L1SIZE-1)) #define alpha_l2trunc(va) ((va) & ~(ALPHA_L2SIZE-1)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pte_w(pte) ((*(pte) & PG_W) != 0) #define pmap_pte_managed(pte) ((*(pte) & PG_MANAGED) != 0) #define pmap_pte_v(pte) ((*(pte) & PG_V) != 0) #define pmap_pte_pa(pte) alpha_ptob(ALPHA_PTE_TO_PFN(*(pte))) #define pmap_pte_prot(pte) (*(pte) & PG_PROT) #define pmap_pte_set_w(pte, v) ((v)?(*pte |= PG_W):(*pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*pte &= ~PG_PROT), (*pte |= (v))) /* * Given a map and a machine independent protection code, * convert to an alpha protection code. */ #define pte_prot(m, p) (protection_codes[m == kernel_pmap ? 
0 : 1][p]) int protection_codes[2][8]; /* * Return non-zero if this pmap is currently active */ #define pmap_isactive(pmap) (pmap->pm_active) /* * Extract level 1, 2 and 3 page table indices from a va */ #define PTMASK ((1 << ALPHA_PTSHIFT) - 1) #define pmap_lev1_index(va) (((va) >> ALPHA_L1SHIFT) & PTMASK) #define pmap_lev2_index(va) (((va) >> ALPHA_L2SHIFT) & PTMASK) #define pmap_lev3_index(va) (((va) >> ALPHA_L3SHIFT) & PTMASK) /* * Given a physical address, construct a pte */ #define pmap_phys_to_pte(pa) ALPHA_PTE_FROM_PFN(alpha_btop(pa)) /* * Given a page frame number, construct a k0seg va */ #define pmap_k0seg_to_pfn(va) alpha_btop(ALPHA_K0SEG_TO_PHYS(va)) /* * Given a pte, construct a k0seg va */ #define pmap_k0seg_to_pte(va) ALPHA_PTE_FROM_PFN(pmap_k0seg_to_pfn(va)) /* * Lev1map: * * Kernel level 1 page table. This maps all kernel level 2 * page table pages, and is used as a template for all user * pmap level 1 page tables. When a new user level 1 page * table is allocated, all Lev1map PTEs for kernel addresses * are copied to the new map. * * Lev2map: * * Initial set of kernel level 2 page table pages. These * map the kernel level 3 page table pages. As kernel * level 3 page table pages are added, more level 2 page * table pages may be added to map them. These pages are * never freed. * * Lev3map: * * Initial set of kernel level 3 page table pages. These * map pages in K1SEG. More level 3 page table pages may * be added at run-time if additional K1SEG address space * is required. These pages are never freed. * * Lev2mapsize: * * Number of entries in the initial Lev2map. * * Lev3mapsize: * * Number of entries in the initial Lev3map. * * NOTE: When mappings are inserted into the kernel pmap, all * level 2 and level 3 page table pages must already be allocated * and mapped into the parent page table. */ pt_entry_t *Lev1map, *Lev2map, *Lev3map; vm_size_t Lev2mapsize, Lev3mapsize; /* * Statically allocated kernel pmap */ struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
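*/

/*
 * Editorial aside, not part of the original file: a standalone sketch
 * of the arithmetic behind the pmap_lev[123]_index() macros above.
 * The constants (13-bit page offset, 10-bit indices, i.e. 8KB pages
 * holding 1024 8-byte PTEs) are the usual Alpha values, assumed here
 * purely for illustration.
 */
#if 0
#include <stdio.h>

#define L3SHIFT	13				/* 8KB pages */
#define PTSHIFT	10				/* 1024 PTEs per table page */
#define L2SHIFT	(L3SHIFT + PTSHIFT)		/* 23 */
#define L1SHIFT	(L3SHIFT + 2 * PTSHIFT)		/* 33 */
#define IDXMASK	((1UL << PTSHIFT) - 1)

int
main(void)
{
	unsigned long va = 0x123456789aUL;

	/* Three 10-bit indices plus the byte offset within the page. */
	printf("lev1 %lu, lev2 %lu, lev3 %lu, offset 0x%lx\n",
	    (va >> L1SHIFT) & IDXMASK,
	    (va >> L2SHIFT) & IDXMASK,
	    (va >> L3SHIFT) & IDXMASK,
	    va & ((1UL << L3SHIFT) - 1));
	return (0);
}
#endif

/*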
*/ static int nklev3, nklev2; vm_offset_t kernel_vm_end; /* * Data for the ASN allocator */ static int pmap_maxasn; static pmap_t pmap_active[MAXCPU]; static LIST_HEAD(,pmap) allpmaps; static struct mtx allpmaps_lock; /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void alpha_protection_init(void); static void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static int pmap_remove_pte(pmap_t pmap, pt_entry_t* ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static int pmap_release_free_page(pmap_t pmap, vm_page_t p); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); #ifdef SMP static void pmap_invalidate_page_action(void *arg); static void pmap_invalidate_all_action(void *arg); #endif /* * Routine: pmap_lev1pte * Function: * Extract the level 1 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev1pte(pmap_t pmap, vm_offset_t va) { if (!pmap) return 0; return &pmap->pm_lev1[pmap_lev1_index(va)]; } /* * Routine: pmap_lev2pte * Function: * Extract the level 2 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev2pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l1pte; pt_entry_t* l2map; l1pte = pmap_lev1pte(pmap, va); if (!pmap_pte_v(l1pte)) return 0; l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); return &l2map[pmap_lev2_index(va)]; } /* * Routine: pmap_lev3pte * Function: * Extract the level 3 page table entry associated * with the given map/virtual_address pair. */ static PMAP_INLINE pt_entry_t* pmap_lev3pte(pmap_t pmap, vm_offset_t va) { pt_entry_t* l2pte; pt_entry_t* l3map; l2pte = pmap_lev2pte(pmap, va); if (!l2pte || !pmap_pte_v(l2pte)) return 0; l3map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l2pte)); return &l3map[pmap_lev3_index(va)]; } vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = ALPHA_PHYS_TO_K0SEG(pa); bzero((caddr_t) va, size); return va; } extern pt_entry_t rom_pte; /* XXX */ extern int prom_mapped; /* XXX */ /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ptaddr, u_int maxasn) { pt_entry_t newpte; int i; /* * Setup ASNs. PCPU_GET(next_asn) and PCPU_GET(current_asngen) are set * up already. */ pmap_maxasn = maxasn; /* * Allocate a level 1 map for the kernel. 
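*/

/*
 * Editorial aside, not part of the original file: pmap_lev3pte() above
 * only dereferences a level if the entry one level up is valid, so a
 * single invalid lev1 or lev2 entry cuts the walk short.  A minimal
 * userland model of that walk (a "PTE" here is just a tagged pointer;
 * all names are illustrative):
 */
#if 0
#include <stdint.h>
#include <stdlib.h>

#define NPTE	1024
#define PG_V	0x1UL

typedef uintptr_t pte_t;

static pte_t *
lev3pte(pte_t *lev1, unsigned i1, unsigned i2, unsigned i3)
{
	pte_t *lev2, *lev3;

	if (!(lev1[i1] & PG_V))		/* pmap_lev2pte() returns 0 */
		return (NULL);
	lev2 = (pte_t *)(lev1[i1] & ~PG_V);
	if (!(lev2[i2] & PG_V))		/* pmap_lev3pte() returns 0 */
		return (NULL);
	lev3 = (pte_t *)(lev2[i2] & ~PG_V);
	return (&lev3[i3]);
}

int
main(void)
{
	pte_t *l1 = calloc(NPTE, sizeof(pte_t));
	pte_t *l2 = calloc(NPTE, sizeof(pte_t));
	pte_t *l3 = calloc(NPTE, sizeof(pte_t));

	l1[5] = (pte_t)l2 | PG_V;	/* lev1 entry -> lev2 table */
	l2[6] = (pte_t)l3 | PG_V;	/* lev2 entry -> lev3 table */
	return (lev3pte(l1, 5, 6, 7) == &l3[7] ? 0 : 1);
}
#endif

/*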
*/ Lev1map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); /* * Allocate a level 2 map for the kernel */ Lev2map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE); Lev2mapsize = PAGE_SIZE; /* * Allocate some level 3 maps for the kernel */ Lev3map = (pt_entry_t*) pmap_steal_memory(PAGE_SIZE*NKPT); Lev3mapsize = NKPT * PAGE_SIZE; /* Map all of the level 2 maps */ for (i = 0; i < howmany(Lev2mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev2map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev1map[K1SEGLEV1I + i] = newpte; } /* Setup the mapping for the prom console */ { if (pmap_uses_prom_console()) { /* XXX save old pte so that we can remap prom if necessary */ rom_pte = *(pt_entry_t *)ptaddr & ~PG_ASM; /* XXX */ } prom_mapped = 0; /* * Actually, this code lies. The prom is still mapped, and will * remain so until the context switch after alpha_init() returns. * Printfs using the firmware before then will end up frobbing * Lev1map unnecessarily, but that's OK. */ } /* * Level 1 self mapping. * * Don't set PG_ASM since the self-mapping is different for each * address space. */ newpte = pmap_k0seg_to_pte((vm_offset_t) Lev1map); newpte |= PG_V | PG_KRE | PG_KWE; Lev1map[PTLEV1I] = newpte; /* Map all of the level 3 maps */ for (i = 0; i < howmany(Lev3mapsize, PAGE_SIZE); i++) { unsigned long pfn = pmap_k0seg_to_pfn((vm_offset_t) Lev3map) + i; newpte = ALPHA_PTE_FROM_PFN(pfn); newpte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_W; Lev2map[i] = newpte; } virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VPTBASE; /* * Initialize protection array. */ alpha_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_lev1 = Lev1map; kernel_pmap->pm_active = ~0; kernel_pmap->pm_asn[alpha_pal_whami()].asn = 0; kernel_pmap->pm_asn[alpha_pal_whami()].gen = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nklev3 = NKPT; nklev2 = 1; /* * Initialize list of pmaps. */ LIST_INIT(&allpmaps); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); /* * Set up proc0's PCB such that the ptbr points to the right place * and has the kernel pmap's ASN. */ thread0.td_pcb->pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t)Lev1map) >> PAGE_SHIFT; thread0.td_pcb->pcb_hw.apcb_asn = 0; } int pmap_uses_prom_console() { int cputype; cputype = hwrpb->rpb_type; return (cputype == ST_DEC_21000 || cputype == ST_DEC_4100); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support, in a fairly consistent * way, discontiguous physical memory. */ void pmap_init(void) { int i; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries.
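*/

/*
 * Editorial aside, not part of the original file: the self-map
 * installed at Lev1map[PTLEV1I] in pmap_bootstrap() above is what
 * makes every level 3 PTE visible at a fixed virtual address (the
 * PTmap region in the layout comment at the top).  A sketch of the
 * resulting vtopte() arithmetic, using assumed Alpha constants and
 * ignoring the sign extension into the kernel region:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define L3SHIFT		13
#define PTSHIFT		10
#define VABITS		(L3SHIFT + 3 * PTSHIFT)	/* 43-bit regions */
#define PTLEV1I		1023UL			/* self-map slot */
#define VPTBASE		(PTLEV1I << (L3SHIFT + 2 * PTSHIFT))

static uint64_t
vtopte_offset(uint64_t va)
{
	va &= (1UL << VABITS) - 1;		/* offset within region */
	return (VPTBASE + ((va >> L3SHIFT) << 3));	/* 8-byte PTEs */
}

int
main(void)
{
	/* The PTE mapping va 0x10000 lives 64 bytes into the PTmap. */
	printf("0x%lx\n", (unsigned long)vtopte_offset(0x10000));
	return (0);
}
#endif

/*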
*/ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ static void pmap_invalidate_asn(pmap_t pmap) { pmap->pm_asn[PCPU_GET(cpuid)].gen = 0; } struct pmap_invalidate_page_arg { pmap_t pmap; vm_offset_t va; }; static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { #ifdef SMP struct pmap_invalidate_page_arg arg; arg.pmap = pmap; arg.va = va; smp_rendezvous(0, pmap_invalidate_page_action, 0, (void *) &arg); } static void pmap_invalidate_page_action(void *arg) { pmap_t pmap = ((struct pmap_invalidate_page_arg *) arg)->pmap; vm_offset_t va = ((struct pmap_invalidate_page_arg *) arg)->va; #endif if (pmap->pm_active & PCPU_GET(cpumask)) { ALPHA_TBIS(va); alpha_pal_imb(); /* XXX overkill? */ } else { pmap_invalidate_asn(pmap); } } static void pmap_invalidate_all(pmap_t pmap) { #ifdef SMP smp_rendezvous(0, pmap_invalidate_all_action, 0, (void *) pmap); } static void pmap_invalidate_all_action(void *arg) { pmap_t pmap = (pmap_t) arg; #endif if (pmap->pm_active & PCPU_GET(cpumask)) { ALPHA_TBIA(); alpha_pal_imb(); /* XXX overkill? */ } else pmap_invalidate_asn(pmap); } static void pmap_get_asn(pmap_t pmap) { if (PCPU_GET(next_asn) > pmap_maxasn) { /* * Start a new ASN generation. * * Invalidate all per-process mappings and I-cache */ PCPU_SET(next_asn, 0); PCPU_SET(current_asngen, (PCPU_GET(current_asngen) + 1) & ASNGEN_MASK); if (PCPU_GET(current_asngen) == 0) { /* * Clear the pm_asn[].gen of all pmaps. * This is safe since it is only called from * pmap_activate after it has deactivated * the old pmap and it only affects this cpu. */ pmap_t tpmap; #ifdef PMAP_DIAGNOSTIC printf("pmap_get_asn: generation rollover\n"); #endif PCPU_SET(current_asngen, 1); mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(tpmap, &allpmaps, pm_list) { tpmap->pm_asn[PCPU_GET(cpuid)].gen = 0; } mtx_unlock_spin(&allpmaps_lock); } /* * Since we are about to start re-using ASNs, we must * clear out the TLB and the I-cache since they are tagged * with the ASN. */ ALPHA_TBIAP(); alpha_pal_imb(); /* XXX overkill? */ } pmap->pm_asn[PCPU_GET(cpuid)].asn = PCPU_GET(next_asn); PCPU_SET(next_asn, PCPU_GET(next_asn) + 1); pmap->pm_asn[PCPU_GET(cpuid)].gen = PCPU_GET(current_asngen); } /*************************************************** * Low level helper routines..... ***************************************************/ /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t* pte = pmap_lev3pte(pmap, va); if (pte) return alpha_ptob(ALPHA_PTE_TO_PFN(*pte)); else return 0; } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. 
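*/

/*
 * Editorial aside, not part of the original file: the heart of
 * pmap_get_asn() above is "hand out ASNs until they run out, then
 * bump the generation", which implicitly invalidates every ASN
 * handed out so far because its recorded generation stops matching.
 * A tiny single-CPU model (MAXASN is illustrative; the real bound
 * arrives via pmap_maxasn in pmap_bootstrap()):
 */
#if 0
#include <stdio.h>

#define MAXASN	63

static unsigned next_asn = 1, current_asngen = 1;

struct asn_info { unsigned asn, gen; };

static void
get_asn(struct asn_info *ai)
{
	if (next_asn > MAXASN) {
		next_asn = 0;
		current_asngen++;	/* wraparound handling omitted */
		/* real code flushes here: ALPHA_TBIAP() + imb */
	}
	ai->asn = next_asn++;
	ai->gen = current_asngen;
}

int
main(void)
{
	struct asn_info ai;
	int i;

	for (i = 0; i < 130; i++)
		get_asn(&ai);
	/* A pmap whose gen != current_asngen must call get_asn() again. */
	printf("asn %u, gen %u\n", ai.asn, ai.gen);
	return (0);
}
#endif

/*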
*/ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa; vm_page_t m; m = NULL; mtx_lock(&Giant); if ((pa = pmap_extract(pmap, va)) != 0) { m = PHYS_TO_VM_PAGE(pa); vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); } mtx_unlock(&Giant); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; pt_entry_t *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; pt_entry_t npte = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m[i])) | PG_ASM | PG_KRE | PG_KWE | PG_V; pt_entry_t opte; pte = vtopte(tva); opte = *pte; *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, tva); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); va += PAGE_SIZE; } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_offset_t pa) { pt_entry_t *pte; pt_entry_t npte, opte; npte = pmap_phys_to_pte(pa) | PG_ASM | PG_KRE | PG_KWE | PG_V; pte = vtopte(va); opte = *pte; *pte = npte; if (opte) pmap_invalidate_page(kernel_pmap, va); } /* * remove a page from the kernel pagetables */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { register pt_entry_t *pte; pte = vtopte(va); *pte = 0; pmap_invalidate_page(kernel_pmap, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { return ALPHA_PHYS_TO_K0SEG(start); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. 
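*/

/*
 * Editorial aside, not part of the original file: pmap_kenter() and
 * pmap_qenter() above share one pattern -- store the new PTE first,
 * then flush the TLB only when the slot previously held a valid
 * mapping, since an invalid entry can never have been cached.  A
 * standalone model of that bookkeeping (names and sizes are
 * illustrative):
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define PG_V	0x1UL

static uint64_t kpt[16];	/* stand-in kernel lev3 table */
static int flushes;

static void
kenter(int idx, uint64_t pfn)
{
	uint64_t opte = kpt[idx];

	kpt[idx] = (pfn << 32) | PG_V;	/* PFN placement is arbitrary */
	if (opte & PG_V)
		flushes++;	/* real code: pmap_invalidate_page() */
}

int
main(void)
{
	kenter(3, 0x100);	/* fresh slot: no flush needed */
	kenter(3, 0x200);	/* replacing a live mapping: flush */
	printf("%d flush(es)\n", flushes);	/* prints 1 */
	return (0);
}
#endif

/*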
*/ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; pt_entry_t* pte; /* * unmap the page table page */ if (m->pindex >= NUSERLEV3MAPS) { /* Level 2 page table */ pte = pmap_lev1pte(pmap, va); pteva = (vm_offset_t) PTlev2 + alpha_ptob(m->pindex - NUSERLEV3MAPS); } else { /* Level 3 page table */ pte = pmap_lev2pte(pmap, va); pteva = (vm_offset_t) PTmap + alpha_ptob(m->pindex); } *pte = 0; if (m->pindex < NUSERLEV3MAPS) { /* unhold the level 2 page table */ vm_page_t lev2pg; lev2pg = PHYS_TO_VM_PAGE(pmap_pte_pa(pmap_lev1pte(pmap, va))); vm_page_unhold(lev2pg); if (lev2pg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, lev2pg); } --pmap->pm_stats.resident_count; /* * Do a invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); if (pmap->pm_ptphint == m) pmap->pm_ptphint = NULL; /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { unsigned ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> ALPHA_L2SHIFT); if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = PHYS_TO_VM_PAGE(pmap_pte_pa(pmap_lev2pte(pmap, va))); pmap->pm_ptphint = mpte; } } return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { int i; PMAP_LOCK_INIT(pmap); pmap->pm_lev1 = Lev1map; pmap->pm_ptphint = NULL; pmap->pm_active = 0; for (i = 0; i < MAXCPU; i++) { pmap->pm_asn[i].asn = 0; pmap->pm_asn[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN | MTX_QUIET); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
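*/

/*
 * Editorial aside, not part of the original file: the shape of the
 * _pmap_unwire_pte_hold() cascade above -- dropping the last hold on
 * a level 3 table page frees it and then drops a hold on its level 2
 * parent, which may free that page in turn.  PTE, TLB and wire-count
 * work is omitted; everything here is illustrative.
 */
#if 0
#include <stdio.h>

struct ptpage {
	int hold_count;
	struct ptpage *parent;	/* lev3 -> lev2, lev2 -> NULL */
	const char *name;
};

static void
unhold(struct ptpage *p)
{
	if (--p->hold_count > 0)
		return;
	printf("freeing %s\n", p->name);
	if (p->parent != NULL)
		unhold(p->parent);
}

int
main(void)
{
	struct ptpage lev2 = { 1, NULL, "lev2 page" };
	struct ptpage lev3 = { 2, &lev2, "lev3 page" };

	unhold(&lev3);	/* one mapping left: nothing happens */
	unhold(&lev3);	/* frees the lev3 page, then the lev2 page */
	return (0);
}
#endif

/*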
*/ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t lev1pg; int i; PMAP_LOCK_INIT(pmap); /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUSERLEV3MAPS + NUSERLEV2MAPS + 1); /* * allocate the page directory page */ VM_OBJECT_LOCK(pmap->pm_pteobj); lev1pg = vm_page_grab(pmap->pm_pteobj, NUSERLEV3MAPS + NUSERLEV2MAPS, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); vm_page_lock_queues(); vm_page_flag_clear(lev1pg, PG_BUSY); lev1pg->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); VM_OBJECT_UNLOCK(pmap->pm_pteobj); pmap->pm_lev1 = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(lev1pg)); /* install self-referential address mapping entry (not PG_ASM) */ pmap->pm_lev1[PTLEV1I] = pmap_phys_to_pte(VM_PAGE_TO_PHYS(lev1pg)) | PG_V | PG_KRE | PG_KWE; pmap->pm_ptphint = NULL; pmap->pm_active = 0; for (i = 0; i < MAXCPU; i++) { pmap->pm_asn[i].asn = 0; pmap->pm_asn[i].gen = 0; } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); bcopy(PTlev1 + K1SEGLEV1I, pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); } static int pmap_release_free_page(pmap_t pmap, vm_page_t p) { pt_entry_t* pte; pt_entry_t* l2map; if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) /* level 1 page table */ pte = &pmap->pm_lev1[PTLEV1I]; else if (p->pindex >= NUSERLEV3MAPS) /* level 2 page table */ pte = &pmap->pm_lev1[p->pindex - NUSERLEV3MAPS]; else { /* level 3 page table */ pte = &pmap->pm_lev1[p->pindex >> ALPHA_PTSHIFT]; l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(pte)); pte = &l2map[p->pindex & ((1 << ALPHA_PTSHIFT) - 1)]; } /* * This code optimizes the case of freeing non-busy * page-table pages. Those pages are zero now, and * might as well be placed directly into the zero queue. */ vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "pmaprl")) return 0; vm_page_busy(p); /* * Remove the page table page from the processes address space. */ *pte = 0; pmap->pm_stats.resident_count--; #ifdef PMAP_DEBUG if (p->hold_count) { panic("pmap_release: freeing held page table page"); } #endif /* * Level1 pages need to have the kernel * stuff cleared, so they can go into the zero queue also. */ if (p->pindex == NUSERLEV3MAPS + NUSERLEV2MAPS) bzero(pmap->pm_lev1 + K1SEGLEV1I, nklev2 * PTESIZE); if (pmap->pm_ptphint == p) pmap->pm_ptphint = NULL; #ifdef PMAP_DEBUG { u_long *lp = (u_long*) ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(p)); u_long *ep = (u_long*) ((char*) lp + PAGE_SIZE); for (; lp < ep; lp++) if (*lp != 0) panic("pmap_release_free_page: page not zero"); } #endif p->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(p); vm_page_unlock_queues(); return 1; } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { pt_entry_t* pte; vm_offset_t ptepa; vm_page_t m; int is_object_locked; /* * Find or fabricate a new pagetable page */ if (!(is_object_locked = VM_OBJECT_LOCKED(pmap->pm_pteobj))) VM_OBJECT_LOCK(pmap->pm_pteobj); m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) 
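*/

/*
 * Editorial aside, not part of the original file: both this function
 * and pmap_release_free_page() above dispatch on where a pindex falls
 * in pm_pteobj -- lev3 table pages first, then lev2 pages, then the
 * single lev1 page at the very end.  A sketch of that dispatch with
 * illustrative sizes (USEG spans 512 lev1 slots of 1024 lev2 slots
 * each, per the layout comment at the top):
 */
#if 0
#include <stdio.h>

#define NUSERLEV3MAPS	(512UL * 1024)	/* illustrative */
#define NUSERLEV2MAPS	512UL		/* illustrative */

static const char *
classify(unsigned long pindex)
{
	if (pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS)
		return ("level 1 page table page");
	if (pindex >= NUSERLEV3MAPS)
		return ("level 2 page table page");
	return ("level 3 page table page");
}

int
main(void)
{
	printf("%s\n", classify(0));
	printf("%s\n", classify(NUSERLEV3MAPS + 3));
	printf("%s\n", classify(NUSERLEV3MAPS + NUSERLEV2MAPS));
	return (0);
}
#endif

/*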
*/ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); if (ptepindex >= NUSERLEV3MAPS) { pte = &pmap->pm_lev1[ptepindex - NUSERLEV3MAPS]; } else { int l1index = ptepindex >> ALPHA_PTSHIFT; pt_entry_t* l1pte = &pmap->pm_lev1[l1index]; pt_entry_t* l2map; if (!pmap_pte_v(l1pte)) _pmap_allocpte(pmap, NUSERLEV3MAPS + l1index); else { vm_page_t l2page; l2page = PHYS_TO_VM_PAGE(pmap_pte_pa(l1pte)); l2page->hold_count++; } l2map = (pt_entry_t*) ALPHA_PHYS_TO_K0SEG(pmap_pte_pa(l1pte)); pte = &l2map[ptepindex & ((1 << ALPHA_PTSHIFT) - 1)]; } *pte = pmap_phys_to_pte(ptepa) | PG_KRE | PG_KWE | PG_V; /* * Set the page table hint */ pmap->pm_ptphint = m; vm_page_lock_queues(); m->valid = VM_PAGE_BITS_ALL; vm_page_wakeup(m); vm_page_unlock_queues(); if (!is_object_locked) VM_OBJECT_UNLOCK(pmap->pm_pteobj); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pt_entry_t* lev2pte; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> (PAGE_SHIFT + ALPHA_PTSHIFT); /* * Get the level2 entry */ lev2pte = pmap_lev2pte(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (lev2pte && pmap_pte_v(lev2pte)) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { m = pmap->pm_ptphint; } else { m = PHYS_TO_VM_PAGE(pmap_pte_pa(lev2pte)); pmap->pm_ptphint = m; } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t p,n,lev1pg; vm_object_t object = pmap->pm_pteobj; int curgeneration; #if defined(DIAGNOSTIC) if (object->ref_count != 1) panic("pmap_release: pteobj reference count != 1"); #endif lev1pg = NULL; retry: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex >= NUSERLEV3MAPS) { continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } for (p = TAILQ_FIRST(&object->memq); p != NULL; p = n) { n = TAILQ_NEXT(p, listq); if (p->pindex < NUSERLEV3MAPS) { /* can this happen? 
maybe panic */ goto retry; } if (p->pindex >= NUSERLEV3MAPS + NUSERLEV2MAPS) { lev1pg = p; continue; } while (1) { if (!pmap_release_free_page(pmap, p) && (object->generation != curgeneration)) goto retry; } } if (lev1pg && !pmap_release_free_page(pmap, lev1pg)) goto retry; mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); PMAP_LOCK_DESTROY(pmap); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { /* XXX come back to this */ struct pmap *pmap; pt_entry_t* pte; pt_entry_t newlev1, newlev2; vm_offset_t pa; vm_page_t nkpg; critical_enter(); if (kernel_vm_end == 0) { kernel_vm_end = VM_MIN_KERNEL_ADDRESS;; /* Count the level 2 page tables */ nklev2 = 0; nklev3 = 0; while (pmap_pte_v(pmap_lev1pte(kernel_pmap, kernel_vm_end))) { nklev2++; nklev3 += (1L << ALPHA_PTSHIFT); kernel_vm_end += ALPHA_L1SIZE; } /* Count the level 3 page tables in the last level 2 page table */ kernel_vm_end -= ALPHA_L1SIZE; nklev3 -= (1 << ALPHA_PTSHIFT); while (pmap_pte_v(pmap_lev2pte(kernel_pmap, kernel_vm_end))) { nklev3++; kernel_vm_end += ALPHA_L2SIZE; } } addr = (addr + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); while (kernel_vm_end < addr) { /* * If the level 1 pte is invalid, allocate a new level 2 page table */ pte = pmap_lev1pte(kernel_pmap, kernel_vm_end); if (!pmap_pte_v(pte)) { int pindex = NKLEV3MAPS + pmap_lev1_index(kernel_vm_end) - K1SEGLEV1I; nkpg = vm_page_alloc(NULL, pindex, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); printf("pmap_growkernel: growing to %lx\n", addr); printf("pmap_growkernel: adding new level2 page table\n"); nklev2++; pmap_zero_page(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); newlev1 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { *pmap_lev1pte(pmap, kernel_vm_end) = newlev1; } mtx_unlock_spin(&allpmaps_lock); *pte = newlev1; pmap_invalidate_all(kernel_pmap); } /* * If the level 2 pte is invalid, allocate a new level 3 page table */ pte = pmap_lev2pte(kernel_pmap, kernel_vm_end); if (pmap_pte_v(pte)) { kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nklev3, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nklev3++; pmap_zero_page(nkpg); pa = VM_PAGE_TO_PHYS(nkpg); newlev2 = pmap_phys_to_pte(pa) | PG_V | PG_ASM | PG_KRE | PG_KWE; *pte = newlev2; kernel_vm_end = (kernel_vm_end + ALPHA_L2SIZE) & ~(ALPHA_L2SIZE - 1); } critical_exit(); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
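*/

/*
 * Editorial aside, not part of the original file: the allocation side
 * of get_pv_entry() below pairs a bare counter with the high-water
 * mark set in pmap_init2(), waking the pagedaemon exactly once per
 * episode of pressure.  A standalone model of that accounting (the
 * threshold value is illustrative):
 */
#if 0
#include <stdio.h>

static int pv_entry_count, pv_entry_high_water = 9000;
static int pagedaemon_waken;

static void
get_pv(void)
{
	pv_entry_count++;
	if (pv_entry_high_water &&
	    pv_entry_count > pv_entry_high_water && !pagedaemon_waken) {
		pagedaemon_waken = 1;
		printf("wake the pagedaemon\n");  /* real code: wakeup() */
	}
}

int
main(void)
{
	int i;

	for (i = 0; i < 9002; i++)
		get_pv();	/* prints once, at entry 9001 */
	return (0);
}
#endif

/*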
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; vm_page_lock_queues(); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; vm_page_unlock_queues(); splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = *ptq; *ptq = 0; if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(&oldpte)); if ((oldpte & PG_FOW) == 0) { if (pmap_track_modified(va)) vm_page_dirty(m); } if ((oldpte & PG_FOR) == 0) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { register pt_entry_t *ptq; ptq = pmap_lev3pte(pmap, va); /* * if there is no pte for this address, just skip it!!! */ if (!ptq || !pmap_pte_v(ptq)) return; /* * get a local va for mappings for this pmap. */ (void) pmap_remove_pte(pmap, ptq, va); pmap_invalidate_page(pmap, va); return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va, nva; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); return; } for (va = sva; va < eva; va = nva) { if (!pmap_pte_v(pmap_lev1pte(pmap, va))) { nva = alpha_l1trunc(va + ALPHA_L1SIZE); continue; } if (!pmap_pte_v(pmap_lev2pte(pmap, va))) { nva = alpha_l2trunc(va + ALPHA_L2SIZE); continue; } pmap_remove_page(pmap, va); nva = va + PAGE_SIZE; } } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) 
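*/

/*
 * Editorial aside, not part of the original file: the stride logic in
 * pmap_remove() above.  An invalid lev1 entry lets the scan skip the
 * whole span a lev2 table would cover, an invalid lev2 entry skips
 * the span of one lev3 table, and only valid leaves cost a
 * page-at-a-time visit.  Constants assume 8KB pages and 10-bit
 * indices:
 */
#if 0
#include <stdio.h>

#define L2SIZE		(1UL << 23)	/* span of one lev3 table: 8MB */
#define L1SIZE		(1UL << 33)	/* span of one lev2 table: 8GB */
#define l1trunc(va)	((va) & ~(L1SIZE - 1))
#define l2trunc(va)	((va) & ~(L2SIZE - 1))

int
main(void)
{
	unsigned long va = 0x12345678UL;

	printf("invalid lev1: next va 0x%lx\n", l1trunc(va + L1SIZE));
	printf("invalid lev2: next va 0x%lx\n", l2trunc(va + L2SIZE));
	printf("valid lev3:   next va 0x%lx\n", va + (1UL << 13));
	return (0);
}
#endif

/*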
*/ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); pv->pv_pmap->pm_stats.resident_count--; if (pmap_pte_pa(pte) != VM_PAGE_TO_PHYS(m)) panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m)); tpte = *pte; *pte = 0; if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; /* * Update the vm_page_t clean and reference bits. */ if ((tpte & PG_FOW) == 0) { if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } if ((tpte & PG_FOR) == 0) vm_page_flag_set(m, PG_REFERENCED); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pt_entry_t* pte; int newprot; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; newprot = pte_prot(pmap, prot); if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); PMAP_LOCK(pmap); while (sva < eva) { /* * If level 1 pte is invalid, skip this segment */ pte = pmap_lev1pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l1trunc(sva) + ALPHA_L1SIZE; continue; } /* * If level 2 pte is invalid, skip this segment */ pte = pmap_lev2pte(pmap, sva); if (!pmap_pte_v(pte)) { sva = alpha_l2trunc(sva) + ALPHA_L2SIZE; continue; } /* * If level 3 pte is invalid, skip this page */ pte = pmap_lev3pte(pmap, sva); if (!pmap_pte_v(pte)) { sva += PAGE_SIZE; continue; } if (pmap_pte_prot(pte) != newprot) { pt_entry_t oldpte = *pte; vm_page_t m = NULL; if ((oldpte & PG_FOR) == 0) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); vm_page_flag_set(m, PG_REFERENCED); oldpte |= (PG_FOR | PG_FOE); } if ((oldpte & PG_FOW) == 0) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); if (pmap_track_modified(sva)) vm_page_dirty(m); oldpte |= PG_FOW; } oldpte = (oldpte & ~PG_PROT) | newprot; *pte = oldpte; pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_offset_t pa; pt_entry_t *pte; vm_offset_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; int managed; if (pmap == NULL) return; va &= ~PAGE_MASK; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. 
*/ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } pte = pmap_lev3pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid kernel page tables pmap=%p, va=0x%lx\n", pmap, va); } origpte = *pte; pa = VM_PAGE_TO_PHYS(m) & ~PAGE_MASK; managed = 0; opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_FOW) != PG_FOW && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } } managed = origpte & PG_MANAGED; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); managed |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = pmap_phys_to_pte(pa) | pte_prot(pmap, prot) | PG_V | managed; if (managed) { /* * Set up referenced/modified emulation for the new * mapping. Any old referenced/modified emulation * results for the old mapping will have been recorded * either in pmap_remove_pte() or above in the code * which handles protection and/or wiring changes. */ newpte |= (PG_FOR | PG_FOW | PG_FOE); } if (wired) newpte |= PG_W; /* * if the mapping or permission bits are different, we need * to update the pte. */ if (origpte != newpte) { *pte = newpte; if (origpte) pmap_invalidate_page(pmap, va); if (prot & VM_PROT_EXECUTE) alpha_pal_imb(); } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { register pt_entry_t *pte; int managed; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pt_entry_t* l2pte; /* * Calculate lev2 page index */ ptepindex = va >> ALPHA_L2SHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { /* * Get the level 2 entry */ l2pte = pmap_lev2pte(pmap, va); /* * If the level 2 page table is mapped, we just increment * the hold count, and activate it. 
*/ if (l2pte && pmap_pte_v(l2pte)) { if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == ptepindex)) { mpte = pmap->pm_ptphint; } else { mpte = PHYS_TO_VM_PAGE(pmap_pte_pa(l2pte)); pmap->pm_ptphint = mpte; } mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, va, mpte); vm_page_unlock_queues(); } alpha_pal_imb(); /* XXX overkill? */ return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ managed = 0; if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); managed = PG_MANAGED | PG_FOR | PG_FOW | PG_FOE; } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ *pte = pmap_phys_to_pte(VM_PAGE_TO_PHYS(m)) | PG_V | PG_KRE | PG_URE | managed; alpha_pal_imb(); /* XXX overkill? */ return mpte; } /* * Make temporary mapping for a physical address. This is called * during dump. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { return (void *) ALPHA_PHYS_TO_K0SEG(pa - (i * PAGE_SIZE)); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pt_entry_t *pte; if (pmap == NULL) return; PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. 
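*/

/*
 * Editorial aside, not part of the original file: pmap_zero_page()
 * and pmap_kenter_temporary() above (and the copy routines that
 * follow) all lean on K0SEG being a 1:1 window over physical memory
 * (base 0xfffffc0000000000 per the layout comment at the top), so
 * "mapping" a page is pure address arithmetic with no PTEs and no
 * TLB traffic:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define K0SEG_BASE	0xfffffc0000000000UL

#define PHYS_TO_K0SEG(pa)	((pa) + K0SEG_BASE)
#define K0SEG_TO_PHYS(va)	((va) - K0SEG_BASE)

int
main(void)
{
	uint64_t pa = 0x2000;

	printf("pa 0x%lx <-> k0seg va 0x%lx\n",
	    (unsigned long)pa, (unsigned long)PHYS_TO_K0SEG(pa));
	return (0);
}
#endif

/*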
*/ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((char *)(caddr_t)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. This is for the vm_pagezero idle process. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(mdst)); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); /* * Not found, check current mappings returning immediately if found. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
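*/

/*
 * Editorial aside, not part of the original file: the shape of
 * pmap_page_exists_quick() above -- scan at most 16 pv entries and
 * treat anything past that bound as "not found", which is acceptable
 * for its only consumer, page aging.  All types here are illustrative
 * stand-ins:
 */
#if 0
#include <stddef.h>

struct pv { const void *pv_pmap; struct pv *next; };

static int
page_exists_quick(const void *pmap, struct pv *head)
{
	struct pv *pv;
	int loops = 0;

	for (pv = head; pv != NULL; pv = pv->next) {
		if (pv->pv_pmap == pmap)
			return (1);
		if (++loops >= 16)
			break;
	}
	return (0);
}

int
main(void)
{
	int p1, p2;
	struct pv a = { &p1, NULL };
	struct pv b = { &p2, &a };

	return (page_exists_quick(&p1, &b) ? 0 : 1);
}
#endif

/*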
*/ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv;
- int s;
#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif
- s = splvm();
+ vm_page_lock_queues();
PMAP_LOCK(pmap); for(pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif if (!pmap_pte_v(pte)) panic("pmap_remove_pages: page on pm_pvlist has no pte\n"); tpte = *pte; /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } *pte = 0; m = PHYS_TO_VM_PAGE(pmap_pte_pa(&tpte)); pv->pv_pmap->pm_stats.resident_count--; npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); }
- splx(s);
pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap);
+ vm_page_unlock_queues();
} /* * this routine is used to modify bits in ptes */ static void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { pv_entry_t pv; pt_entry_t *pte; int changed; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == (PG_UWE|PG_KWE) && (m->flags & PG_WRITEABLE) == 0)) return; s = splvm(); changed = 0; /* * Loop over all current mappings setting/clearing as apropos. If * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && bit == (PG_UWE|PG_KWE)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); changed = 0; if (setem) { *pte |= bit; changed = 1; } else { pt_entry_t pbits = *pte; if (pbits & bit) { changed = 1; *pte = pbits & ~bit; } } if (changed) pmap_invalidate_page(pv->pv_pmap, pv->pv_va); PMAP_UNLOCK(pv->pv_pmap); } if (!setem && bit == (PG_UWE|PG_KWE)) vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_KWE|PG_UWE, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int count; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return 0; /* * Loop over current mappings looking for any which don't * have PG_FOR set (i.e. ones where we have taken an emulate * reference trap recently).
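*/

/*
 * Editorial aside, not part of the original file: Alpha has no
 * hardware referenced/modified bits, so this pmap emulates them with
 * the fault-on bits -- a PTE with PG_FOR clear has taken (and
 * consumed) a read fault since the bits were last set, i.e. it has
 * been referenced.  A standalone model of the counting that follows
 * (bit values are illustrative):
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define PG_FOR	0x2UL	/* fault on read */
#define PG_FOE	0x4UL	/* fault on execute */

static int
ts_referenced(uint64_t *ptes, int n)
{
	int i, count = 0;

	for (i = 0; i < n; i++) {
		if (!(ptes[i] & PG_FOR)) {	/* fault already taken */
			count++;
			ptes[i] |= PG_FOR | PG_FOE;	/* re-arm */
		}
	}
	return (count);
}

int
main(void)
{
	uint64_t ptes[3] = { 0, PG_FOR | PG_FOE, 0 };

	printf("%d referenced\n", ts_referenced(ptes, 3));  /* prints 2 */
	return (0);
}
#endif

/*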
*/ count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & PG_FOR)) { count++; *pte |= PG_FOR | PG_FOE; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } return count; } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; /* * A page is modified if any mapping has had its PG_FOW flag * cleared. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & PG_FOW)) return 1; } return 0; } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; if (!pmap_pte_v(pmap_lev1pte(pmap, addr)) || !pmap_pte_v(pmap_lev2pte(pmap, addr))) return (FALSE); pte = vtopte(addr); if (*pte) return (FALSE); return (TRUE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; /* * Loop over current mappings setting PG_FOW where needed. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & PG_FOW)) { *pte |= PG_FOW; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; /* * Loop over current mappings setting PG_FOR and PG_FOE where needed. */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); pte = pmap_lev3pte(pv->pv_pmap, pv->pv_va); if (!(*pte & (PG_FOR | PG_FOE))) { *pte |= (PG_FOR | PG_FOE); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_emulate_reference: * * Emulate reference and/or modified bit hits. * From NetBSD */ void pmap_emulate_reference(struct vmspace *vm, vm_offset_t v, int user, int write) { pmap_t pmap; pt_entry_t faultoff, *pte; /* * Convert process and virtual address to physical address. */ if (v >= VM_MIN_KERNEL_ADDRESS) { if (user) panic("pmap_emulate_reference: user ref to kernel"); pmap = kernel_pmap; PMAP_LOCK(pmap); pte = vtopte(v); } else { KASSERT(vm != NULL, ("pmap_emulate_reference: bad vmspace")); pmap = &vm->vm_pmap; PMAP_LOCK(pmap); pte = pmap_lev3pte(pmap, v); } #ifdef DEBUG /* These checks are more expensive */ if (!pmap_pte_v(pte)) panic("pmap_emulate_reference: invalid pte"); #if 0 /* * Can't do these, because cpu_fork and cpu_swapin call * pmap_emulate_reference(), and the bits aren't guaranteed, * for them... */ if (write) { if (!(*pte & (user ? PG_UWE : PG_UWE | PG_KWE))) panic("pmap_emulate_reference: write but unwritable"); if (!(*pte & PG_FOW)) panic("pmap_emulate_reference: write but not FOW"); } else { if (!(*pte & (user ? PG_URE : PG_URE | PG_KRE))) panic("pmap_emulate_reference: !write but unreadable"); if (!(*pte & (PG_FOR | PG_FOE))) panic("pmap_emulate_reference: !write but not FOR|FOE"); } #endif /* Other diagnostics?
*/ #endif KASSERT((*pte & PG_MANAGED) != 0, ("pmap_emulate_reference(%p, 0x%lx, %d, %d): pa 0x%lx not managed", curthread, v, user, write, pmap_pte_pa(pte))); /* * Twiddle the appropriate bits to reflect the reference * and/or modification.. * * The rules: * (1) always mark page as used, and * (2) if it was a write fault, mark page as modified. */ if (write) { faultoff = PG_FOR | PG_FOE | PG_FOW; } else { faultoff = PG_FOR | PG_FOE; } *pte = (*pte & ~faultoff); ALPHA_TBIS(v); PMAP_UNLOCK(pmap); } /* * Miscellaneous support routines follow */ static void alpha_protection_init() { int prot, *kp, *up; kp = protection_codes[0]; up = protection_codes[1]; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: *kp++ = PG_ASM; *up++ = 0; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KRE; *up++ = PG_URE | PG_KRE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = PG_ASM | PG_KWE; *up++ = PG_UWE | PG_KWE; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_ASM | PG_KWE | PG_KRE; *up++ = PG_UWE | PG_URE | PG_KWE | PG_KRE; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { return (void*) ALPHA_PHYS_TO_K0SEG(pa); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *pte; int val = 0; pte = pmap_lev3pte(pmap, addr); if (pte == 0) { return 0; } if (pmap_pte_v(pte)) { vm_page_t m; vm_offset_t pa; val = MINCORE_INCORE; if ((*pte & PG_MANAGED) == 0) return val; pa = pmap_pte_pa(pte); m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if ((*pte & PG_FOW) == 0) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if ((*pte & (PG_FOR | PG_FOE)) == 0) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); critical_enter(); if (pmap_active[PCPU_GET(cpuid)] && pmap != pmap_active[PCPU_GET(cpuid)]) { atomic_clear_32(&pmap_active[PCPU_GET(cpuid)]->pm_active, PCPU_GET(cpumask)); pmap_active[PCPU_GET(cpuid)] = 0; } td->td_pcb->pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t) pmap->pm_lev1) >> PAGE_SHIFT; if (pmap->pm_asn[PCPU_GET(cpuid)].gen != PCPU_GET(current_asngen)) pmap_get_asn(pmap); pmap_active[PCPU_GET(cpuid)] = pmap; atomic_set_32(&pmap->pm_active, PCPU_GET(cpumask)); td->td_pcb->pcb_hw.apcb_asn = pmap->pm_asn[PCPU_GET(cpuid)].asn; critical_exit(); if (td == curthread) { alpha_pal_swpctx((u_long)td->td_md.md_pcbpaddr); } } void pmap_deactivate(struct thread *td) { pmap_t pmap; pmap = vmspace_pmap(td->td_proc->p_vmspace); 
atomic_clear_32(&pmap->pm_active, PCPU_GET(cpumask)); pmap_active[PCPU_GET(cpuid)] = 0; } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #if 0 #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPG; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick(pmap, va); if (pte && pmap_pte_v(pte)) { vm_offset_t pa; vm_page_t m; pa = *(int *)pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_offset_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPG; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif #endif Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 132081) +++ head/sys/amd64/amd64/pmap.c (revision 132082) @@ -1,2905 +1,2906 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. 
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static int nkpt; static int ndmpdp; static vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end; pt_entry_t pg_nx; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void pmap_clear_ptes(vm_page_t m, int bit); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 2MB. 
This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return newaddr; } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return va >> PDRSHIFT; } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { if (!pmap) return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return NULL; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return NULL; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *pte; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PTmap + (amd64_btop(va) & mask)); } static u_int64_t allocpages(int n) { u_int64_t ret; ret = avail_start; bzero((void *)ret, n * PAGE_SIZE); avail_start += n * PAGE_SIZE; return (ret); } static void create_pagetables(void) { int i; /* Allocate pages */ KPTphys = allocpages(NKPT); KPML4phys = allocpages(1); KPDPphys = allocpages(NKPML4E); KPDphys = allocpages(NKPDPE); ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = allocpages(NDMPML4E); DMPDphys = allocpages(ndmpdp); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Fill in the underlying page table pages */ /* Read-only from zero to physfree */ /* XXX not fully used, underneath 2M pages */ for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) { ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; } /* Now map the page tables at their location within PTmap */ for (i = 0; i < NKPT; i++) { ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; } /* Map 
from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < avail_start; i++) { ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And connect up the PD to the PDP */ for (i = 0; i < NKPDPE; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } /* Now set up the direct map space using 2MB pages */ for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And the direct map space's PDP */ for (i = 0; i < ndmpdp; i++) { ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr) vm_paddr_t *firstaddr; { vm_offset_t va; pt_entry_t *pte, *unused; avail_start = *firstaddr; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(); *firstaddr = avail_start; virtual_avail = (vm_offset_t) KERNBASE + avail_start; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE | CR4_PSE); load_cr3(KPML4phys); /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1 is only used for the memory test. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) virtual_avail = va; *CMAP1 = 0; invltlb(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. 
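 * Concretely, this means giving every vm_page an empty pv list and
 * creating the UMA zone from which pv_entry allocations are served.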
*/ void pmap_init(void) { int i; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; if (smp_started) { if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; if (smp_started) { if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. 
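	 * (If this CPU spun in the shootdown rendezvous with interrupts
	 * disabled, it could never service a peer CPU's IPI, hence the
	 * PSL_I panic above rather than risking a deadlock.)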
* XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; if (smp_started) { if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #endif /* !SMP */ /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde, *pdep; rtval = 0; if (pmap == NULL) return (rtval); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL) { pde = *pdep; if (pde) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. 
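 *
 * A hypothetical caller (an illustrative sketch only, not code from
 * this file) might look like:
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_WRITE);
 *	if (m != NULL) {
 *		... operate on the held page ...
 *		vm_page_lock_queues();
 *		vm_page_unhold(m);
 *		vm_page_unlock_queues();
 *	}
 *
 * NULL is returned, and nothing held, if there is no valid mapping
 * or the mapping is read-only while VM_PROT_WRITE was requested.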
*/ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte; vm_page_t m; m = NULL; if (pmap == NULL) return (m); vm_page_lock_queues(); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pte(pmap, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = pmap_pde(kernel_pmap, va); if (*pde & PG_PS) { pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)); } else { pa = *vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return pa; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. 
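 * On amd64 this can cascade upward: freeing a page table page drops
 * a hold on its page directory page, which may in turn be freed and
 * drop a hold on the corresponding PDP page.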
*/ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); *pd = 0; } --pmap->pm_stats.resident_count; if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); vm_page_unhold(pdpg); if (pdpg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdpg); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); vm_page_unhold(pdppg); if (pdppg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdppg); } if (pmap_is_current(pmap)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { if (va >= VM_MAXUSER_ADDRESS) return 0; return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t pml4pg; static vm_pindex_t color; PMAP_LOCK_INIT(pmap); /* * allocate the page directory page */ while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * this routine is called if the page table page is not * mapped correctly. 
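 * The ptepindex encodes the level of the wanted page: indices in
 * [0, NUPDE) name PT pages, [NUPDE, NUPDE + NUPDPE) name PD pages,
 * and higher indices name PDP pages. For example, the PT page
 * covering user va 0 has ptepindex 0 and its PD page has ptepindex
 * NUPDE.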
* * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; vm_pindex_t ptepindex; { vm_page_t m, pdppg, pdpg; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { VM_WAIT; /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index) == NULL) { vm_page_lock_queues(); vm_page_unhold(m); vm_page_free(m); vm_page_unlock_queues(); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->hold_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. 
*/ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex) == NULL) { vm_page_lock_queues(); vm_page_unhold(m); vm_page_free(m); vm_page_unlock_queues(); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex) == NULL) { vm_page_lock_queues(); vm_page_unhold(m); vm_page_free(m); vm_page_unlock_queues(); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->hold_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { *pd = 0; pd = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != 0 && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->hold_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex); if (m == NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
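 * At this point only the PML4 page itself remains: its kernel,
 * direct map and recursive slots are cleared and the page is freed.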
*/ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); pmap->pm_pml4[KPML4I] = 0; /* KVA */ pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */ pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ vm_page_lock_queues(); m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); vm_page_unlock_queues(); PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t newpdp; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { pde = pmap_pde(kernel_pmap, kernel_vm_end); if (pde == NULL) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdp = (pdp_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp; continue; /* try again */ } if ((*pde & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
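 * When pv_entry_count crosses pv_entry_high_water the pagedaemon is
 * woken so that pv entries can be reclaimed.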
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; vm_page_lock_queues(); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; vm_page_unlock_queues(); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { mpte = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va) & PG_FRAME); return pmap_unuse_pt(pmap, va, mpte); } } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte = pmap_pte(pmap, va); if (pte == NULL || (*pte & PG_V) == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. 
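	 * (A single 4KB page that is not covered by a 2MB mapping is
	 * removed and invalidated directly, skipping the loop below.)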
*/ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva); PMAP_UNLOCK(pmap); return; } } anyvalid = 0; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if (pml4e == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pdpe(pmap, sva); if (pdpe == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == 0) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; for (; sva != va_next; sva += PAGE_SIZE) { pte = pmap_pte(pmap, sva); if (pte == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. 
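 * Taking away all read permission degenerates into a full
 * pmap_remove(); write permission is never added here, only taken
 * away.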
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if (pml4e == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pdpe(pmap, sva); if (pdpe == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == NULL) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (va_next > eva) va_next = eva; for (; sva != va_next; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; pte = pmap_pte(pmap, sva); if (pte == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", origpte, va); } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) panic("pmap_enter: invalid page directory va=%#lx\n", va); pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 2MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. 
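	 * PG_W is a software-only bit, so a pure wiring change needs
	 * no TLB invalidation, just a statistics update.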
*/ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); PMAP_LOCK(pmap); err = pmap_remove_pte(pmap, pte, va); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 2MB page"); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); if (mpte == NULL) goto retry; } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, va, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. 
Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; int npdes; pd_entry_t ptepa, *pde; pde = pmap_pde(pmap, addr); if (pde != 0 && (*pde & PG_V) != 0) return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; pde++; } pmap_invalidate_all(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; atomic_set_long(pte, PG_W); } else if (!wired && (*pte & PG_W) != 0) { pmap->pm_stats.wired_count--; atomic_clear_long(pte, PG_W); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
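 * This implementation copies only when the source and destination
 * ranges coincide and the source pmap is the current one; copied
 * ptes have their PG_M and PG_A bits cleared.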
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; vm_pindex_t ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. */ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pml4e = pmap_pml4e(src_pmap, addr); if (pml4e == 0) { va_next = (addr + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pdpe(src_pmap, addr); if (pdpe == 0) { va_next = (addr + NBPDP) & ~PDPMASK; continue; } va_next = (addr + NBPDR) & ~PDRMASK; ptepindex = pmap_pde_pindex(addr); pde = pmap_pde(src_pmap, addr); if (pde) srcptepaddr = *pde; else continue; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { pde = pmap_pde(dst_pmap, addr); if (pde == 0) { /* * XXX should do an allocpte here to * instantiate the pde */ continue; } if (*pde == 0) { *pde = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY)) continue; if (va_next > end_addr) va_next = end_addr; src_pte = vtopte(addr); while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp & PG_FRAME); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, addr, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. 
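 * (As with pmap_zero_page() above, no transient KVM mapping is
 * actually needed on amd64; the direct map is used instead.)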
*/ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif - mtx_assert(&vm_page_queue_mtx, MA_OWNED); + vm_page_lock_queues(); PMAP_LOCK(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte(pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08lx\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
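 * On amd64 this consults the hardware-maintained PG_M bit of each
 * mapping, skipping addresses whose modified state is not tracked.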
*/ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; boolean_t rv; rv = FALSE; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rv); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * Mappings within the clean submap are never tracked as * modified; skip them. */ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_pte(pv->pv_pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & PG_V)) { pte = vtopte(addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the given bit in each of the given page's ptes. */ static __inline void pmap_clear_ptes(vm_page_t m, int bit) { register pv_entry_t pv; pt_entry_t pbits, *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings, setting/clearing as appropriate. * (If setting RO, do we need to clear the VAC?) */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (bit == PG_RW) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_pte(pv->pv_pmap, pv->pv_va); pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } if (bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_clear_ptes(m, PG_RW); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages.
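 *
 * Illustrative use only (hypothetical caller; the usual consumer is
 * the page daemon): the count is treated as an activity hint, roughly:
 */
#if 0
	int refs;

	refs = pmap_ts_referenced(m);	/* also clears the PG_A bits */
	if (refs == 0)
		;	/* not recently referenced: candidate for reuse */
	else
		m->act_count += refs;	/* keep the page active longer */
#endif
/*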
*/ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; PMAP_LOCK(pv->pv_pmap); pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { PMAP_UNLOCK(pv->pv_pmap); break; } } PMAP_UNLOCK(pv->pv_pmap); } while ((pv = pvn) != NULL && pv != pvf); } return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_clear_ptes(m, PG_M); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_clear_ptes(m, PG_A); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; /* If this fits within the direct map window, use it */ if (pa < dmaplimit && (pa + size) < dmaplimit) return ((void *)PHYS_TO_DMAP(pa)); offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? 
*ptep : 0; PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap, oldpmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #ifdef SMP if (oldpmap) /* XXX FIXME */ atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else if (oldpmap) /* XXX FIXME */ oldpmap->pm_active &= ~PCPU_GET(cpumask); pmap->pm_active |= PCPU_GET(cpumask); #endif cr3 = vtophys(pmap->pm_pml4); /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } Index: head/sys/arm/arm/pmap.c =================================================================== --- head/sys/arm/arm/pmap.c (revision 132081) +++ head/sys/arm/arm/pmap.c (revision 132082) @@ -1,4649 +1,4648 @@ /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */ /* * Copyright 2004 Olivier Houchard. * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2002-2003 Wasabi Systems, Inc. * Copyright (c) 2001 Richard Earnshaw * Copyright (c) 2001-2002 Christopher Gilbert * All rights reserved. * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * * RiscBSD kernel project * * pmap.c * * Machine dependant vm stuff * * Created : 20/09/94 */ /* * Special compilation symbols * PMAP_DEBUG - Build in pmap_debug_level code */ /* Include header files */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PMAP_DEBUG #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 0; #define PMAP_INLINE #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) 
#define PMAP_INLINE #endif /* PMAP_DEBUG */ /* * Get PDEs and PTEs for user/kernel address space */ #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDR_SHIFT]) #define pmap_pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; extern struct pv_addr systempage; /* * Internal function prototypes */ static PMAP_INLINE void pmap_invalidate_page (pmap_t, vm_offset_t); #if 0 static PMAP_INLINE void pmap_invalidate_tlb (pmap_t, vm_offset_t); #endif static PMAP_INLINE void pmap_invalidate_tlb_all (pmap_t); static PMAP_INLINE void pmap_changebit (vm_page_t, int, boolean_t); static PMAP_INLINE int pmap_track_modified(vm_offset_t); static pt_entry_t * pmap_pte (pmap_t, vm_offset_t); static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t); static PMAP_INLINE void pmap_free_pv_entry (pv_entry_t); static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t, vm_page_t); static void arm_protection_init(void); static pv_entry_t pmap_get_pv_entry(void); static void pmap_vac_me_harder(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_kpmap(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_user(struct vm_page *, pmap_t, vm_offset_t); static void pmap_alloc_l1(pmap_t); static void pmap_free_l1(pmap_t); static void pmap_use_l1(pmap_t); static PMAP_INLINE boolean_t pmap_is_current(pmap_t); static PMAP_INLINE boolean_t pmap_is_cached(pmap_t); static void pmap_clearbit(struct vm_page *, u_int); static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t); static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t); static void pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int); static vm_offset_t kernel_pt_lookup(vm_paddr_t); static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1"); vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ extern void *end; vm_offset_t kernel_vm_end = 0; struct pmap kernel_pmap_store; pmap_t kernel_pmap; static pt_entry_t *csrc_pte, *cdst_pte; static vm_offset_t csrcp, cdstp; static void pmap_init_l1(struct l1_ttable *, pd_entry_t *); /* * These routines are called when the CPU type is identified to set up * the PTE prototypes, cache modes, etc. * * The variables are always here, just in case LKMs need to reference * them (though, they shouldn't). */ pt_entry_t pte_l1_s_cache_mode; pt_entry_t pte_l1_s_cache_mode_pt; pt_entry_t pte_l1_s_cache_mask; pt_entry_t pte_l2_l_cache_mode; pt_entry_t pte_l2_l_cache_mode_pt; pt_entry_t pte_l2_l_cache_mask; pt_entry_t pte_l2_s_cache_mode; pt_entry_t pte_l2_s_cache_mode_pt; pt_entry_t pte_l2_s_cache_mask; pt_entry_t pte_l2_s_prot_u; pt_entry_t pte_l2_s_prot_w; pt_entry_t pte_l2_s_prot_mask; pt_entry_t pte_l1_s_proto; pt_entry_t pte_l1_c_proto; pt_entry_t pte_l2_s_proto; void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t); void (*pmap_zero_page_func)(vm_paddr_t, int, int); /* * Which pmap is currently 'live' in the cache * * XXXSCW: Fix for SMP ... */ union pmap_cache_state *pmap_cache_state; LIST_HEAD(pmaplist, pmap); struct pmaplist allpmaps; static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ /* static pt_entry_t *msgbufmap;*/ struct msgbuf *msgbufp = 0; extern void bcopy_page(vm_offset_t, vm_offset_t); extern void bzero_page(vm_offset_t); /* * Metadata for L1 translation tables. 
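 *
 * A note on the domain free-list below (implementation detail, spelled
 * out here for clarity): l1_domain_first names the first free domain
 * number in this L1 and l1_domain_free[d] names the free domain that
 * follows 'd', so pmap_alloc_l1()/pmap_free_l1() can allocate and
 * release domains with O(1) array operations instead of a bitmap scan.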
*/ struct l1_ttable { /* Entry on the L1 Table list */ SLIST_ENTRY(l1_ttable) l1_link; /* Entry on the L1 Least Recently Used list */ TAILQ_ENTRY(l1_ttable) l1_lru; /* Track how many domains are allocated from this L1 */ volatile u_int l1_domain_use_count; /* * A free-list of domain numbers for this L1. * We avoid using ffs() and a bitmap to track domains since ffs() * is slow on ARM. */ u_int8_t l1_domain_first; u_int8_t l1_domain_free[PMAP_DOMAINS]; /* Physical address of this L1 page table */ vm_paddr_t l1_physaddr; /* KVA of this L1 page table */ pd_entry_t *l1_kva; }; /* * Convert a virtual address into its L1 table index. That is, the * index used to locate the L2 descriptor table pointer in an L1 table. * This is basically used to index l1->l1_kva[]. * * Each L2 descriptor table represents 1MB of VA space. */ #define L1_IDX(va) (((vm_offset_t)(va)) >> L1_S_SHIFT) /* * L1 Page Tables are tracked using a Least Recently Used list. * - New L1s are allocated from the HEAD. * - Freed L1s are added to the TAIL. * - Recently accessed L1s (where an 'access' is some change to one of * the userland pmaps which owns this L1) are moved to the TAIL. */ static TAILQ_HEAD(, l1_ttable) l1_lru_list; static struct mtx l1_lru_lock; /* * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots. * * This is normally 16MB worth of L2 page descriptors for any given pmap. * Reference counts are maintained for L2 descriptors so they can be * freed when empty. */ struct l2_dtable { /* The number of L2 page descriptors allocated to this l2_dtable */ u_int l2_occupancy; /* List of L2 page descriptors */ struct l2_bucket { pt_entry_t *l2b_kva; /* KVA of L2 Descriptor Table */ vm_paddr_t l2b_phys; /* Physical address of same */ u_short l2b_l1idx; /* This L2 table's L1 index */ u_short l2b_occupancy; /* How many active descriptors */ } l2_bucket[L2_BUCKET_SIZE]; }; /* * Given an L1 table index, calculate the corresponding l2_dtable index * and bucket index within the l2_dtable. */ #define L2_IDX(l1idx) (((l1idx) >> L2_BUCKET_LOG2) & \ (L2_SIZE - 1)) #define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1)) /* * Given a virtual address, this macro returns the * virtual address required to drop into the next L2 bucket. */ #define L2_NEXT_BUCKET(va) (((va) & L1_S_FRAME) + L1_S_SIZE) /* * L2 allocation. */ #define pmap_alloc_l2_dtable() \ (void*)uma_zalloc(l2table_zone, M_NOWAIT) #define pmap_free_l2_dtable(l2) \ uma_zfree(l2table_zone, l2) /* * We try to map the page tables write-through, if possible. However, not * all CPUs have a write-through cache mode, so on those we have to sync * the cache when we frob page tables. * * We try to evaluate this at compile time, if possible. However, it's * not always possible to do that, hence this run-time var. */ int pmap_needs_pte_sync; /* * Macro to determine if a mapping might be resident in the * instruction cache and/or TLB */ #define PV_BEEN_EXECD(f) (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC)) /* * Macro to determine if a mapping might be resident in the * data cache and/or TLB */ #define PV_BEEN_REFD(f) (((f) & PVF_REF) != 0) /* * Cache enable bits in PTE to use on pages that are cacheable. * On most machines this is cacheable/bufferable, but on some, e.g. arm10, * we can choose between write-through and write-back caching.
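 * (In ARM descriptor terms, PT_C marks a page cacheable and PT_B
 * bufferable; setting both requests write-back behaviour on CPUs
 * that support it.)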
*/ pt_entry_t pte_cache_mode = (PT_C | PT_B); /* * Data for the pv entry allocation mechanism */ #define MINPV 1024 #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif static uma_zone_t pvzone; static uma_zone_t l2zone; static uma_zone_t l2table_zone; static struct vm_object pvzone_obj; static struct vm_object l2zone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; int pmap_pagedaemon_waken = 0; void pmap_deactivate(struct thread *); void pmap_deactivate(struct thread *td) { } /* * This list exists for the benefit of pmap_map_chunk(). It keeps track * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can * find them as necessary. * * Note that the data on this list MUST remain valid after initarm() returns, * as pmap_bootstrap() uses it to construct L2 table metadata. */ SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list); static void pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt) { int i; l1->l1_kva = l1pt; l1->l1_domain_use_count = 0; l1->l1_domain_first = 0; for (i = 0; i < PMAP_DOMAINS; i++) l1->l1_domain_free[i] = i + 1; /* * Copy the kernel's L1 entries to each new L1. */ if (pmap_initialized) memcpy(l1pt, pmap_kernel()->pm_l1->l1_kva, L1_TABLE_SIZE); if ((l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt)) == 0) panic("pmap_init_l1: can't get PA of L1 at %p", l1pt); if (l1->l1_physaddr & (L1_TABLE_SIZE - 1)) panic("pmap_init_l1: misaligned L1 table at %p", l1pt); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); } static vm_offset_t kernel_pt_lookup(vm_paddr_t pa) { struct pv_addr *pv; SLIST_FOREACH(pv, &kernel_pt_list, pv_list) { #ifndef ARM32_NEW_VM_LAYOUT if (pv->pv_pa == (pa & ~PAGE_MASK)) { return (pv->pv_va | (pa & PAGE_MASK)); } #else if (pv->pv_pa == pa) return (pv->pv_va); #endif } return (0); } #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 void pmap_pte_init_generic(void) { pte_l1_s_cache_mode = L1_S_B|L1_S_C; pte_l1_s_cache_mask = L1_S_CACHE_MASK_generic; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_generic; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_generic; /* * If we have a write-through cache, set B and C. If * we have a write-back cache, then we assume setting * only C will make those pages write-through. */ if (cpufuncs.cf_dcache_wb_range == (void *) cpufunc_nullop) { pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; } else { pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } pte_l2_s_prot_u = L2_S_PROT_U_generic; pte_l2_s_prot_w = L2_S_PROT_W_generic; pte_l2_s_prot_mask = L2_S_PROT_MASK_generic; pte_l1_s_proto = L1_S_PROTO_generic; pte_l1_c_proto = L1_C_PROTO_generic; pte_l2_s_proto = L2_S_PROTO_generic; pmap_copy_page_func = pmap_copy_page_generic; pmap_zero_page_func = pmap_zero_page_generic; } #if defined(CPU_ARM8) void pmap_pte_init_arm8(void) { /* * ARM8 is compatible with generic, but we need to use * the page tables uncached. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = 0; pte_l2_l_cache_mode_pt = 0; pte_l2_s_cache_mode_pt = 0; } #endif /* CPU_ARM8 */ #if defined(CPU_ARM9) && defined(ARM9_CACHE_WRITE_THROUGH) void pmap_pte_init_arm9(void) { /* * ARM9 is compatible with generic, but we want to use * write-through caching for now.
*/ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM9 */ #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if defined(CPU_ARM10) void pmap_pte_init_arm10(void) { /* * ARM10 is compatible with generic, but we want to use * write-through caching for now. */ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_B | L1_S_C; pte_l2_l_cache_mode = L2_B | L2_C; pte_l2_s_cache_mode = L2_B | L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM10 */ #if ARM_MMU_SA1 == 1 void pmap_pte_init_sa1(void) { /* * The StrongARM SA-1 cache does not have a write-through * mode. So, do the generic initialization, then reset * the page table cache mode to B=1,C=1, and note that * the PTEs need to be sync'd. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; pmap_needs_pte_sync = 1; } #endif /* ARM_MMU_SA1 == 1*/ #if ARM_MMU_XSCALE == 1 #if (ARM_NMMUS > 1) static u_int xscale_use_minidata; #endif void pmap_pte_init_xscale(void) { uint32_t auxctl; int write_through = 0; pte_l1_s_cache_mode = L1_S_B|L1_S_C; pte_l1_s_cache_mask = L1_S_CACHE_MASK_xscale; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_xscale; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_xscale; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; #ifdef XSCALE_CACHE_READ_WRITE_ALLOCATE /* * The XScale core has an enhanced mode where writes that * miss the cache cause a cache line to be allocated. This * is significantly faster than the traditional, write-through * behavior of this case. */ pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_X); pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_X); pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_X); #endif /* XSCALE_CACHE_READ_WRITE_ALLOCATE */ #ifdef XSCALE_CACHE_WRITE_THROUGH /* * Some versions of the XScale core have various bugs in * their cache units, the work-around for which is to run * the cache in write-through mode. Unfortunately, this * has a major (negative) impact on performance. So, we * go ahead and run fast-and-loose, in the hopes that we * don't line up the planets in a way that will trip the * bugs. * * However, we give you the option to be slow-but-correct. */ write_through = 1; #elif defined(XSCALE_CACHE_WRITE_BACK) /* force write back cache mode */ write_through = 0; #elif defined(CPU_XSCALE_PXA2X0) /* * Intel PXA2[15]0 processors are known to have a bug in * write-back cache on revision 4 and earlier (stepping * A[01] and B[012]). Fixed for C0 and later. 
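 *
 * The check below masks the CP15 ID register down to the processor
 * type, then forces write-through mode when the revision field is
 * below 5, i.e. on the affected A0-A1 and B0-B2 steppings.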
*/ { uint32_t id, type; id = cpufunc_id(); type = id & ~(CPU_ID_XSCALE_COREREV_MASK|CPU_ID_REVISION_MASK); if (type == CPU_ID_PXA250 || type == CPU_ID_PXA210) { if ((id & CPU_ID_REVISION_MASK) < 5) { /* write through for stepping A0-1 and B0-2 */ write_through = 1; } } } #endif /* XSCALE_CACHE_WRITE_THROUGH */ if (write_through) { pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; } #if (ARM_NMMUS > 1) xscale_use_minidata = 1; #endif pte_l2_s_prot_u = L2_S_PROT_U_xscale; pte_l2_s_prot_w = L2_S_PROT_W_xscale; pte_l2_s_prot_mask = L2_S_PROT_MASK_xscale; pte_l1_s_proto = L1_S_PROTO_xscale; pte_l1_c_proto = L1_C_PROTO_xscale; pte_l2_s_proto = L2_S_PROTO_xscale; pmap_copy_page_func = pmap_copy_page_xscale; pmap_zero_page_func = pmap_zero_page_xscale; /* * Disable ECC protection of page table access, for now. */ __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl &= ~XSCALE_AUXCTL_P; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } /* * xscale_setup_minidata: * * Set up the mini-data cache clean area. We require the * caller to allocate the right amount of physically and * virtually contiguous space. */ extern vm_offset_t xscale_minidata_clean_addr; extern vm_size_t xscale_minidata_clean_size; /* already initialized */ void xscale_setup_minidata(vm_offset_t l1pt, vm_offset_t va, vm_paddr_t pa) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte; vm_size_t size; uint32_t auxctl; xscale_minidata_clean_addr = va; /* Round it to page size. */ size = (xscale_minidata_clean_size + L2_S_OFFSET) & L2_S_FRAME; for (; size != 0; va += L2_S_SIZE, pa += L2_S_SIZE, size -= L2_S_SIZE) { #ifndef ARM32_NEW_VM_LAYOUT pte = (pt_entry_t *) kernel_pt_lookup(pde[va >> L1_S_SHIFT] & L2_S_FRAME); #else pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); #endif if (pte == NULL) panic("xscale_setup_minidata: can't find L2 table for " "VA 0x%08x", (u_int32_t) va); #ifndef ARM32_NEW_VM_LAYOUT pte[(va >> PAGE_SHIFT) & 0x3ff] = #else pte[l2pte_index(va)] = #endif L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); } /* * Configure the mini-data cache for write-back with * read/write-allocate. * * NOTE: In order to reconfigure the mini-data cache, we must * make sure it contains no valid data! In order to do that, * we must issue a global data cache invalidate command! * * WE ASSUME WE ARE RUNNING UN-CACHED WHEN THIS ROUTINE IS CALLED! * THIS IS VERY IMPORTANT! */ /* Invalidate data and mini-data. */ __asm __volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0)); __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl = (auxctl & ~XSCALE_AUXCTL_MD_MASK) | XSCALE_AUXCTL_MD_WB_RWA; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } #endif /* * Allocate an L1 translation table for the specified pmap. * This is called at pmap creation time. */ static void pmap_alloc_l1(pmap_t pm) { struct l1_ttable *l1; u_int8_t domain; /* * Remove the L1 at the head of the LRU list */ mtx_lock(&l1_lru_lock); l1 = TAILQ_FIRST(&l1_lru_list); TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Pick the first available domain number, and update * the link to the next number. */ domain = l1->l1_domain_first; l1->l1_domain_first = l1->l1_domain_free[domain]; /* * If there are still free domain numbers in this L1, * put it back on the TAIL of the LRU list. 
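 * (Invariant: an L1 appears on the LRU list only while it still has
 * at least one unallocated domain; a fully-committed L1 stays off
 * the list until pmap_free_l1() returns a domain to it.)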
*/ if (++l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); /* * Fix up the relevant bits in the pmap structure */ pm->pm_l1 = l1; pm->pm_domain = domain; } /* * Free an L1 translation table. * This is called at pmap destruction time. */ static void pmap_free_l1(pmap_t pm) { struct l1_ttable *l1 = pm->pm_l1; mtx_lock(&l1_lru_lock); /* * If this L1 is currently on the LRU list, remove it. */ if (l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Free up the domain number which was allocated to the pmap */ l1->l1_domain_free[pm->pm_domain] = l1->l1_domain_first; l1->l1_domain_first = pm->pm_domain; l1->l1_domain_use_count--; /* * The L1 now must have at least 1 free domain, so add * it back to the LRU list. If the use count is zero, * put it at the head of the list, otherwise it goes * to the tail. */ if (l1->l1_domain_use_count == 0) { TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru); } else TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } static PMAP_INLINE void pmap_use_l1(pmap_t pm) { struct l1_ttable *l1; /* * Do nothing for the kernel pmap: access to an L1 by the * kernel pmap must not affect the LRU list. */ if (pm == pmap_kernel()) return; l1 = pm->pm_l1; /* * If the L1 is not currently on the LRU list, just return */ if (l1->l1_domain_use_count == PMAP_DOMAINS) return; mtx_lock(&l1_lru_lock); /* * Check the use count again, now that we've acquired the lock */ if (l1->l1_domain_use_count == PMAP_DOMAINS) { mtx_unlock(&l1_lru_lock); return; } /* * Move the L1 to the back of the LRU list */ TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA, or NULL if no L2 bucket exists for the address. */ static PMAP_INLINE struct l2_bucket * pmap_get_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL || (l2b = &l2->l2_bucket[L2_BUCKET(l1idx)])->l2b_kva == NULL) return (NULL); return (l2b); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA. * * If no L2 bucket exists, perform the necessary allocations to put an L2 * bucket/page table in place. * * Note that if a new L2 bucket/page was allocated, the caller *must* * increment the bucket occupancy counter appropriately *before* * releasing the pmap's lock to ensure no other thread or cpu deallocates * the bucket/page in the meantime. */ static struct l2_bucket * pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ if ((l2 = pmap_alloc_l2_dtable()) == NULL) { return (NULL); } bzero(l2, sizeof(*l2)); /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above.
*/ ptep = (void*)uma_zalloc(l2zone, M_NOWAIT); if (ptep == NULL) { /* * Oops, no more L2 page tables available at this * time. We may need to deallocate the l2_dtable * if we allocated a new one above. */ if (l2->l2_occupancy == 0) { pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } return (NULL); } l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_phys = vtophys(ptep); l2b->l2b_l1idx = l1idx; } return (l2b); } static PMAP_INLINE void #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(pt_entry_t *l2) #else pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2) #endif { #ifdef PMAP_INCLUDE_PTE_SYNC /* * Note: With a write-back cache, we may need to sync this * L2 table before re-using it. * This is because it may have belonged to a non-current * pmap, in which case the cache syncs would have been * skipped when the pages were being unmapped. If the * L2 table were then to be immediately re-allocated to * the *current* pmap, it may well contain stale mappings * which have not yet been cleared by a cache write-back * and so would still be visible to the mmu. */ if (need_sync) PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); #endif uma_zfree(l2zone, l2); } /* * One or more mappings in the specified L2 descriptor table have just been * invalidated. * * Garbage collect the metadata and descriptor table itself if necessary. * * The pmap lock must be acquired when this is called (not necessary * for the kernel pmap). */ static void pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; /* * Update the bucket's reference count according to how many * PTEs the caller has just invalidated. */ l2b->l2b_occupancy -= count; /* * Note: * * Level 2 page tables allocated to the kernel pmap are never freed * as that would require checking all Level 1 page tables and * removing any references to the Level 2 page table. See also the * comment elsewhere about never freeing bootstrap L2 descriptors. * * We make do with just invalidating the mapping in the L2 table. * * This isn't really a big deal in practice and, in fact, leads * to a performance win over time as we don't need to continually * alloc/free. */ if (l2b->l2b_occupancy > 0 || pm == pmap_kernel()) return; /* * There are no more valid mappings in this level 2 page table. * Go ahead and NULL-out the pointer in the bucket, then * free the page table. */ l1idx = l2b->l2b_l1idx; ptep = l2b->l2b_kva; l2b->l2b_kva = NULL; pl1pd = &pm->pm_l1->l1_kva[l1idx]; /* * If the L1 slot matches the pmap's domain * number, then invalidate it. */ l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK); if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) { *pl1pd = 0; PTE_SYNC(pl1pd); } /* * Release the L2 descriptor table back to the pool cache. */ #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(ptep); #else pmap_free_l2_ptp(!pmap_is_cached(pm), ptep); #endif /* * Update the reference count in the associated l2_dtable */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (--l2->l2_occupancy > 0) return; /* * There are no more valid mappings in any of the Level 1 * slots managed by this l2_dtable. Go ahead and NULL-out * the pointer in the parent pmap and free the l2_dtable. */ pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } /* * Pool cache constructors for L2 descriptor tables, metadata and pmap * structures.
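 *
 * For reference, pmap_init2() below wires this constructor to its
 * zone as:
 *
 *	l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL,
 *	    pmap_l2ptp_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR,
 *	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 *
 * so every L2 page table handed out by uma_zalloc() has already been
 * zeroed and had its cache mode corrected.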
*/ static void pmap_l2ptp_ctor(void *mem, int size, void *arg) { #ifndef PMAP_INCLUDE_PTE_SYNC struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK; /* * The mappings for these page tables were initially made using * pmap_kenter_pa() by the pool subsystem. Therefore, the cache- * mode will not be right for page table mappings. To avoid * polluting the pmap_kenter_pa() code with a special case for * page tables, we simply fix up the cache-mode here if it's not * correct. */ l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { /* * Page tables must have the cache-mode set to Write-Thru. */ *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); cpu_cpwait(); } #endif memset(mem, 0, L2_TABLE_SIZE_REAL); PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); } /* * A bunch of routines to conditionally flush the caches/TLB depending * on whether the specified pmap actually needs to be flushed at any * given time. */ static PMAP_INLINE void pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va) { if (pm->pm_cstate.cs_tlb_id) cpu_tlb_flushID_SE(va); } static PMAP_INLINE void pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va) { if (pm->pm_cstate.cs_tlb_d) cpu_tlb_flushD_SE(va); } static PMAP_INLINE void pmap_tlb_flushID(pmap_t pm) { if (pm->pm_cstate.cs_tlb_id) { cpu_tlb_flushID(); pm->pm_cstate.cs_tlb = 0; } } static PMAP_INLINE void pmap_tlb_flushD(pmap_t pm) { if (pm->pm_cstate.cs_tlb_d) { cpu_tlb_flushD(); pm->pm_cstate.cs_tlb_d = 0; } } static PMAP_INLINE void pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len) { if (pm->pm_cstate.cs_cache_id) cpu_idcache_wbinv_range(va, len); } static PMAP_INLINE void pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len, boolean_t do_inv, boolean_t rd_only) { if (pm->pm_cstate.cs_cache_d) { if (do_inv) { if (rd_only) cpu_dcache_inv_range(va, len); else cpu_dcache_wbinv_range(va, len); } else if (!rd_only) cpu_dcache_wb_range(va, len); } } static PMAP_INLINE void pmap_idcache_wbinv_all(pmap_t pm) { if (pm->pm_cstate.cs_cache_id) { cpu_idcache_wbinv_all(); pm->pm_cstate.cs_cache = 0; } } static PMAP_INLINE void pmap_dcache_wbinv_all(pmap_t pm) { if (pm->pm_cstate.cs_cache_d) { cpu_dcache_wbinv_all(); pm->pm_cstate.cs_cache_d = 0; } } static PMAP_INLINE boolean_t pmap_is_current(pmap_t pm) { if (pm == pmap_kernel() || (curproc && curproc->p_vmspace->vm_map.pmap == pm)) return (TRUE); return (FALSE); } static PMAP_INLINE boolean_t pmap_is_cached(pmap_t pm) { if (pm == pmap_kernel() || pmap_cache_state == NULL || pmap_cache_state == &pm->pm_cstate) return (TRUE); return (FALSE); } /* * PTE_SYNC_CURRENT: * * Make sure the pte is written out to RAM. * We need to do this for one of two cases: * - We're dealing with the kernel pmap * - There is no pmap active in the cache/tlb. * - The specified pmap is 'active' in the cache/tlb. */ #ifdef PMAP_INCLUDE_PTE_SYNC #define PTE_SYNC_CURRENT(pm, ptep) \ do { \ if (PMAP_NEEDS_PTE_SYNC && \ pmap_is_cached(pm)) \ PTE_SYNC(ptep); \ } while (/*CONSTCOND*/0) #else #define PTE_SYNC_CURRENT(pm, ptep) /* nothing */ #endif /* * Since we have a virtually indexed cache, we may need to inhibit caching if * there is more than one mapping and at least one of them is writable. * Since we purge the cache on every context switch, we only need to check for * other mappings within the same pmap, or kernel_pmap. 
* This function is also called when a page is unmapped, to possibly reenable * caching on any remaining mappings. * * The code implements the following logic, where: * * KW = # of kernel read/write pages * KR = # of kernel read only pages * UW = # of user read/write pages * UR = # of user read only pages * * KC = kernel mapping is cacheable * UC = user mapping is cacheable * * KW=0,KR=0 KW=0,KR>0 KW=1,KR=0 KW>1,KR>=0 * +--------------------------------------------- * UW=0,UR=0 | --- KC=1 KC=1 KC=0 * UW=0,UR>0 | UC=1 KC=1,UC=1 KC=0,UC=0 KC=0,UC=0 * UW=1,UR=0 | UC=1 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 * UW>1,UR>=0 | UC=0 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 */ static const int pmap_vac_flags[4][4] = { {-1, 0, 0, PVF_KNC}, {0, 0, PVF_NC, PVF_NC}, {0, PVF_NC, PVF_NC, PVF_NC}, {PVF_UNC, PVF_NC, PVF_NC, PVF_NC} }; static PMAP_INLINE int pmap_get_vac_flags(const struct vm_page *pg) { int kidx, uidx; kidx = 0; if (pg->md.kro_mappings || pg->md.krw_mappings > 1) kidx |= 1; if (pg->md.krw_mappings) kidx |= 2; uidx = 0; if (pg->md.uro_mappings || pg->md.urw_mappings > 1) uidx |= 1; if (pg->md.urw_mappings) uidx |= 2; return (pmap_vac_flags[uidx][kidx]); } static __inline void pmap_vac_me_harder(struct vm_page *pg, pmap_t pm, vm_offset_t va) { int nattr; nattr = pmap_get_vac_flags(pg); if (nattr < 0) { pg->md.pvh_attrs &= ~PVF_NC; return; } if (nattr == 0 && (pg->md.pvh_attrs & PVF_NC) == 0) { return; } if (pm == pmap_kernel()) pmap_vac_me_kpmap(pg, pm, va); else pmap_vac_me_user(pg, pm, va); pg->md.pvh_attrs = (pg->md.pvh_attrs & ~PVF_NC) | nattr; } static void pmap_vac_me_kpmap(struct vm_page *pg, pmap_t pm, vm_offset_t va) { u_int u_cacheable, u_entries; struct pv_entry *pv; pmap_t last_pmap = pm; /* * Pass one, see if there are both kernel and user pmaps for * this page. Calculate whether there are user-writable or * kernel-writable pages. */ u_cacheable = 0; TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { if (pv->pv_pmap != pm && (pv->pv_flags & PVF_NC) == 0) u_cacheable++; } u_entries = pg->md.urw_mappings + pg->md.uro_mappings; /* * We know we have just been updating a kernel entry, so if * all user pages are already cacheable, then there is nothing * further to do. */ if (pg->md.k_mappings == 0 && u_cacheable == u_entries) return; if (u_entries) { /* * Scan over the list again, for each entry, if it * might not be set correctly, call pmap_vac_me_user * to recalculate the settings. */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* * We know kernel mappings will get set * correctly in other calls. We also know * that if the pmap is the same as last_pmap * then we've just handled this entry. */ if (pv->pv_pmap == pm || pv->pv_pmap == last_pmap) continue; /* * If there are kernel entries and this page * is writable but non-cacheable, then we can * skip this entry also. */ if (pg->md.k_mappings && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == (PVF_NC | PVF_WRITE)) continue; /* * Similarly if there are no kernel-writable * entries and the page is already * read-only/cacheable. */ if (pg->md.krw_mappings == 0 && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == 0) continue; /* * For some of the remaining cases, we know * that we must recalculate, but for others we * can't tell if they are correct or not, so * we recalculate anyway. 
*/ pmap_vac_me_user(pg, (last_pmap = pv->pv_pmap), 0); } if (pg->md.k_mappings == 0) return; } pmap_vac_me_user(pg, pm, va); } static void pmap_vac_me_user(struct vm_page *pg, pmap_t pm, vm_offset_t va) { pmap_t kpmap = pmap_kernel(); struct pv_entry *pv, *npv; struct l2_bucket *l2b; pt_entry_t *ptep, pte; u_int entries = 0; u_int writable = 0; u_int cacheable_entries = 0; u_int kern_cacheable = 0; u_int other_writable = 0; /* * Count mappings and writable mappings in this pmap. * Include kernel mappings as part of our own. * Keep a pointer to the first one. */ npv = TAILQ_FIRST(&pg->md.pv_list); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* Count mappings in the same pmap */ if (pm == pv->pv_pmap || kpmap == pv->pv_pmap) { if (entries++ == 0) npv = pv; /* Cacheable mappings */ if ((pv->pv_flags & PVF_NC) == 0) { cacheable_entries++; if (kpmap == pv->pv_pmap) kern_cacheable++; } /* Writable mappings */ if (pv->pv_flags & PVF_WRITE) ++writable; } else if (pv->pv_flags & PVF_WRITE) other_writable = 1; } /* * Enable or disable caching as necessary. * Note: the first entry might be part of the kernel pmap, * so we can't assume this is indicative of the state of the * other (maybe non-kpmap) entries. */ if ((entries > 1 && writable) || (entries > 0 && pm == kpmap && other_writable)) { if (cacheable_entries == 0) return; for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if ((pm != pv->pv_pmap && kpmap != pv->pv_pmap) || (pv->pv_flags & PVF_NC)) continue; pv->pv_flags |= PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = *ptep & ~L2_S_CACHE_MASK; if ((va != pv->pv_va || pm != pv->pv_pmap) && l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_idcache_wbinv_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE); pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, (pv->pv_flags & PVF_WRITE) == 0); pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } cpu_cpwait(); } else if (entries > cacheable_entries) { /* * Turn cacheing back on for some pages. If it is a kernel * page, only do so if there are no other writable pages. */ for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if (!(pv->pv_flags & PVF_NC) || (pm != pv->pv_pmap && (kpmap != pv->pv_pmap || other_writable))) continue; pv->pv_flags &= ~PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode; if (l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } } } /* * Modify pte bits for all ptes corresponding to the given physical address. * We use `maskbits' rather than `clearbits' because we're always passing * constants and the latter would require an extra inversion at run-time. 
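 *
 * Hypothetical call sites, shown for illustration only:
 */
#if 0
	/* Clear the "modified" attribute of every mapping of pg. */
	pmap_clearbit(pg, PVF_MOD);

	/* Revoke write permission from every mapping of pg. */
	pmap_clearbit(pg, PVF_WRITE);
#endif
/*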
*/ static void pmap_clearbit(struct vm_page *pg, u_int maskbits) { struct l2_bucket *l2b; struct pv_entry *pv; pt_entry_t *ptep, npte, opte; pmap_t pm; vm_offset_t va; u_int oflags; #if 0 PMAP_HEAD_TO_MAP_LOCK(); simple_lock(&pg->mdpage.pvh_slock); #endif /* * Clear saved attributes (modify, reference) */ pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF)); if (TAILQ_EMPTY(&pg->md.pv_list)) { #if 0 simple_unlock(&pg->mdpage.pvh_slock); PMAP_HEAD_TO_MAP_UNLOCK(); #endif return; } /* * Loop over all current mappings, setting/clearing as appropriate. */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { va = pv->pv_va; pm = pv->pv_pmap; oflags = pv->pv_flags; pv->pv_flags &= ~maskbits; #if 0 pmap_acquire_pmap_lock(pm); #endif l2b = pmap_get_l2_bucket(pm, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; npte = opte = *ptep; if (maskbits & (PVF_WRITE|PVF_MOD)) { if ((pv->pv_flags & PVF_NC)) { /* * Entry is not cacheable: * * Don't turn caching on again if this is a * modified emulation. This would be * inconsistent with the settings created by * pmap_vac_me_harder(). Otherwise, it's safe * to re-enable caching. * * There's no need to call pmap_vac_me_harder() * here: all pages are losing their write * permission. */ if (maskbits & PVF_WRITE) { npte |= pte_l2_s_cache_mode; pv->pv_flags &= ~PVF_NC; } } else if (opte & L2_S_PROT_W) { /* * Entry is writable/cacheable: if the pmap * is current, flush the entry from the cache; * otherwise it cannot be in the cache. */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, (maskbits & PVF_REF) ? TRUE : FALSE, FALSE); } /* make the pte read only */ npte &= ~L2_S_PROT_W; if (maskbits & PVF_WRITE) { /* * Keep alias accounting up to date */ if (pv->pv_pmap == pmap_kernel()) { if (oflags & PVF_WRITE) { pg->md.krw_mappings--; pg->md.kro_mappings++; } } else if (oflags & PVF_WRITE) { pg->md.urw_mappings--; pg->md.uro_mappings++; } } } if (maskbits & PVF_REF) { if ((pv->pv_flags & PVF_NC) == 0 && (maskbits & (PVF_WRITE|PVF_MOD)) == 0) { /* * Check npte here; we may have already * done the wbinv above, and the validity * of the PTE is the same for opte and * npte. */ if (npte & L2_S_PROT_W) { if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, FALSE); } else if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) { /* XXXJRT need idcache_inv_range */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } } /* * Make the PTE invalid so that we will take a * page fault the next time the mapping is * referenced. */ npte &= ~L2_TYPE_MASK; npte |= L2_TYPE_INV; } if (npte != opte) { *ptep = npte; PTE_SYNC(ptep); /* Flush the TLB entry if this pmap is current.
*/ if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pm, pv->pv_va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pm, pv->pv_va); } #if 0 pmap_release_pmap_lock(pm); #endif } #if 0 simple_unlock(&pg->mdpage.pvh_slock); PMAP_HEAD_TO_MAP_UNLOCK(); #endif } /* * main pv_entry manipulation functions: * pmap_enter_pv: enter a mapping onto a vm_page list * pmap_remove_pv: remove a mapping from a vm_page list * * NOTE: pmap_enter_pv expects to lock the pvh itself * pmap_remove_pv expects the caller to lock the pvh before calling */ /* * pmap_enter_pv: enter a mapping onto a vm_page list * * => caller should hold the proper lock on pmap_main_lock * => caller should have pmap locked * => we will gain the lock on the vm_page and allocate the new pv_entry * => caller should adjust ptp's wire_count before calling * => caller should not adjust pmap's wire_count */ static void pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm, vm_offset_t va, u_int flags) { pve->pv_pmap = pm; pve->pv_va = va; pve->pv_flags = flags; #if 0 mtx_lock(&pg->md.pvh_mtx); TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist); #endif TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list); pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD); if (pm == pmap_kernel()) { if (flags & PVF_WRITE) pg->md.krw_mappings++; else pg->md.kro_mappings++; } else if (flags & PVF_WRITE) pg->md.urw_mappings++; else pg->md.uro_mappings++; #if 0 mtx_unlock(&pg->md.pvh_mtx); #endif if (pve->pv_flags & PVF_WIRED) ++pm->pm_stats.wired_count; } /* * * pmap_find_pv: Find a pv entry * * => caller should hold lock on vm_page */ static PMAP_INLINE struct pv_entry * pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pv; TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) if (pm == pv->pv_pmap && va == pv->pv_va) break; return (pv); } /* * vector_page_setprot: * * Manipulate the protection of the vector page. */ void vector_page_setprot(int prot) { struct l2_bucket *l2b; pt_entry_t *ptep; l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page); ptep = &l2b->l2b_kva[l2pte_index(vector_page)]; *ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot); PTE_SYNC(ptep); cpu_tlb_flushD_SE(vector_page); cpu_cpwait(); } /* * pmap_remove_pv: try to remove a mapping from a pv_list * * => caller should hold proper lock on pmap_main_lock * => pmap should be locked * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should adjust ptp's wire_count and free PTP if needed * => caller should NOT adjust pmap's wire_count * => we return the removed pve */ static struct pv_entry * pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pve, **prevptr; prevptr = &TAILQ_FIRST(&pg->md.pv_list);/* previous pv_entry pointer */ pve = *prevptr; while (pve) { if (pve->pv_pmap == pm && pve->pv_va == va) { /* match? */ *prevptr = TAILQ_NEXT(pve, pv_list); /* remove it! */ if (pve->pv_flags & PVF_WIRED) --pm->pm_stats.wired_count; if (pm == pmap_kernel()) { if (pve->pv_flags & PVF_WRITE) pg->md.krw_mappings--; else pg->md.kro_mappings--; } else if (pve->pv_flags & PVF_WRITE) pg->md.urw_mappings--; else pg->md.uro_mappings--; break; } prevptr = &TAILQ_NEXT(pve, pv_list); pve = TAILQ_NEXT(pve, pv_list); } return(pve); /* return removed pve */ } /* * * pmap_modify_pv: Update pv flags * * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should NOT adjust pmap's wire_count * => caller must call pmap_vac_me_harder() if writable status of a page * may have changed.
* => we return the old flags * * Modify a physical-virtual mapping in the pv table */ static u_int pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va, u_int clr_mask, u_int set_mask) { struct pv_entry *npv; u_int flags, oflags; if ((npv = pmap_find_pv(pg, pm, va)) == NULL) return (0); /* * There is at least one VA mapping this page. */ if (clr_mask & (PVF_REF | PVF_MOD)) pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD); oflags = npv->pv_flags; npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask; if ((flags ^ oflags) & PVF_WIRED) { if (flags & PVF_WIRED) ++pm->pm_stats.wired_count; else --pm->pm_stats.wired_count; } if ((flags ^ oflags) & PVF_WRITE) { if (pm == pmap_kernel()) { if (flags & PVF_WRITE) { pg->md.krw_mappings++; pg->md.kro_mappings--; } else { pg->md.kro_mappings++; pg->md.krw_mappings--; } } else if (flags & PVF_WRITE) { pg->md.urw_mappings++; pg->md.uro_mappings--; } else { pg->md.uro_mappings++; pg->md.urw_mappings--; } } return (oflags); } /* Function to set the debug level of the pmap code */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ void pmap_pinit0(struct pmap *pmap) { PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap)); dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n", (u_int32_t) pmap, (u_int32_t) pmap->pm_pdir); pmap_pinit(pmap); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support, in a fairly consistent * way, discontiguous physical memory. */ void pmap_init(void) { int i; PDEBUG(1, printf("pmap_init:\n")); /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for (i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; PDEBUG(1, printf("pmap_init: done!\n")); } int pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user) { struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; int rv = 0; #if 0 PMAP_MAP_TO_HEAD_LOCK(); pmap_acquire_pmap_lock(pm); #endif l1idx = L1_IDX(va); /* * If there is no l2_dtable for this address, then the process * has no business accessing it. * * Note: This will catch userland processes trying to access * kernel addresses. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto out; /* * Likewise if there is no L2 descriptor table */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; if (l2b->l2b_kva == NULL) goto out; /* * Check the PTE itself. */ ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if (pte == 0) goto out; /* * Catch a userland access to the vector page mapped at 0x0 */ if (user && (pte & L2_S_PROT_U) == 0) goto out; pa = l2pte_pa(pte); if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) { /* * This looks like a good candidate for "page modified" * emulation...
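 *
 * (ARM page tables have no hardware-updated modified bit, so writable
 * pages are initially entered read-only; the first store faults into
 * this path, which records PVF_MOD and grants write permission.)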
*/ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) { goto out; } /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) { goto out; } /* * Do the flags say this page is writable? If not then it * is a genuine write fault. If yes then the write fault is * our fault as we did not reflect the write access in the * PTE. Now we know a write has occurred we can correct this * and also set the modified bit */ if ((pv->pv_flags & PVF_WRITE) == 0) { goto out; } pg->md.pvh_attrs |= PVF_REF | PVF_MOD; pv->pv_flags |= PVF_REF | PVF_MOD; /* * Re-enable write permissions for the page. No need to call * pmap_vac_me_harder(), since this is just a * modified-emulation fault, and the PVF_WRITE bit isn't * changing. We've already set the cacheable bits based on * the assumption that we can write to this page. */ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W; PTE_SYNC(ptep); rv = 1; } else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) { /* * This looks like a good candidate for "page referenced" * emulation. */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) goto out; /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) { goto out; } pg->md.pvh_attrs |= PVF_REF; pv->pv_flags |= PVF_REF; *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO; PTE_SYNC(ptep); rv = 1; } /* * We know there is a valid mapping here, so simply * fix up the L1 if necessary. */ pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); rv = 1; } #ifdef CPU_SA110 /* * There are bugs in the rev K SA110. This is a check for one * of them. */ if (rv == 0 && curcpu()->ci_arm_cputype == CPU_ID_SA110 && curcpu()->ci_arm_cpurev < 3) { /* Always current pmap */ if (l2pte_valid(pte)) { extern int kernel_debug; if (kernel_debug & 1) { struct proc *p = curlwp->l_proc; printf("prefetch_abort: page is already " "mapped - pte=%p *pte=%08x\n", ptep, pte); printf("prefetch_abort: pc=%08lx proc=%p " "process=%s\n", va, p, p->p_comm); printf("prefetch_abort: far=%08x fs=%x\n", cpu_faultaddress(), cpu_faultstatus()); } #ifdef DDB if (kernel_debug & 2) Debugger(); #endif rv = 1; } } #endif /* CPU_SA110 */ #ifdef DEBUG /* * If 'rv == 0' at this point, it generally indicates that there is a * stale TLB entry for the faulting address. This happens when two or * more processes are sharing an L1. Since we don't flush the TLB on * a context switch between such processes, we can take domain faults * for mappings which exist at the same VA in both processes. EVEN IF * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for * example. * * This is extremely likely to happen if pmap_enter() updated the L1 * entry for a recently entered mapping. In this case, the TLB is * flushed for the new mapping, but there may still be TLB entries for * other mappings belonging to other processes in the 1MB range * covered by the L1 entry. * * Since 'rv == 0', we know that the L1 already contains the correct * value, so the fault must be due to a stale TLB entry. * * Since we always need to flush the TLB anyway in the case where we * fixed up the L1, or frobbed the L2 PTE, we effectively deal with * stale TLB entries dynamically. * * However, the above condition can ONLY happen if the current L1 is * being shared. 
If it happens when the L1 is unshared, it indicates * that other parts of the pmap are not doing their job WRT managing * the TLB. */ if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) { extern int last_fault_code; printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n", pm, va, ftype); printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n", l2, l2b, ptep, pl1pd); printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n", pte, l1pd, last_fault_code); #ifdef DDB Debugger(); #endif } #endif cpu_tlb_flushID_SE(va); cpu_cpwait(); rv = 1; out: #if 0 pmap_release_pmap_lock(pm); PMAP_MAP_TO_HEAD_UNLOCK(); #endif return (rv); } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pt; pt_entry_t *ptep, pte; vm_offset_t va, eva; u_int loop, needed; int i; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(l2zone, 512); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); uma_zone_set_obj(l2zone, &l2zone_obj, pv_entry_max); needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0); needed -= 1; l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK); for (loop = 0; loop < needed; loop++, l1++) { /* Allocate a L1 page table */ va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, NULL, 0, 0x0, 0xffffffff, L1_TABLE_SIZE, 0); if (va == 0) panic("Cannot allocate L1 KVM"); eva = va + L1_TABLE_SIZE; pl1pt = (pd_entry_t *)va; for (i = 0; i < (L1_TABLE_SIZE / PAGE_SIZE) && va < eva; i++) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; *ptep = pte; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); va += PAGE_SIZE; } pmap_init_l1(l1, pl1pt); } #ifdef DEBUG printf("pmap_postinit: Allocated %d static L1 descriptor tables\n", needed); #endif } /* * This is used to stuff certain critical values into the PCB where they * can be accessed quickly from cpu_switch() et al. */ void pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb) { struct l2_bucket *l2b; pcb->pcb_pagedir = pm->pm_l1->l1_physaddr; pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | (DOMAIN_CLIENT << (pm->pm_domain * 2)); pcb->pcb_cstate = (void *)&pm->pm_cstate; if (vector_page < KERNBASE) { pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)]; l2b = pmap_get_l2_bucket(pm, vector_page); pcb->pcb_l1vec = l2b->l2b_phys | L1_C_PROTO | L1_C_DOM(pm->pm_domain); } else pcb->pcb_pl1vec = NULL; } void pmap_activate(struct thread *td) { pmap_t pm; struct pcb *pcb; int s; pm = td->td_proc->p_vmspace->vm_map.pmap; pcb = td->td_pcb; critical_enter(); pmap_set_pcb_pagedir(pm, pcb); if (td == curthread) { u_int cur_dacr, cur_ttb; __asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb)); __asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr)); cur_ttb &= ~(L1_TABLE_SIZE - 1); if (cur_ttb == (u_int)pcb->pcb_pagedir && cur_dacr == pcb->pcb_dacr) { /* * No need to switch address spaces. 
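			 * (The translation table base register already
			 * points at this pmap's L1 -- the TTB value is
			 * masked to its L1_TABLE_SIZE-aligned base before
			 * the comparison -- and the domain access control
			 * register matches too, so the MMU is already set
			 * up for this pmap.)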
*/ critical_exit(); return; } disable_interrupts(I32_bit | F32_bit); /* * We MUST, I repeat, MUST fix up the L1 entry corresponding * to 'vector_page' in the incoming L1 table before switching * to it otherwise subsequent interrupts/exceptions (including * domain faults!) will jump into hyperspace. */ if (pcb->pcb_pl1vec) { *pcb->pcb_pl1vec = pcb->pcb_l1vec; /* * Don't need to PTE_SYNC() at this point since * cpu_setttb() is about to flush both the cache * and the TLB. */ } cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); enable_interrupts(I32_bit | F32_bit); /* * Flag any previous userland pmap as being NOT * resident in the cache/tlb. */ if (pmap_cache_state && pmap_cache_state != &pm->pm_cstate) pmap_cache_state->cs_all = 0; /* * The new pmap, however, IS resident. */ pmap_cache_state = &pm->pm_cstate; pm->pm_cstate.cs_all = PMAP_CACHE_STATE_ALL; splx(s); } critical_exit(); } static int pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va) { pd_entry_t *pdep, pde; pt_entry_t *ptep, pte; vm_offset_t pa; int rv = 0; /* * Make sure the descriptor itself has the correct cache mode */ pdep = &kl1[L1_IDX(va)]; pde = *pdep; if (l1pte_section_p(pde)) { if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) { *pdep = (pde & ~L1_S_CACHE_MASK) | pte_l1_s_cache_mode_pt; PTE_SYNC(pdep); cpu_dcache_wbinv_range((vm_offset_t)pdep, sizeof(*pdep)); rv = 1; } } else { pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) panic("pmap_bootstrap: No L2 for L2 @ va %p\n", ptep); ptep = &ptep[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_dcache_wbinv_range((vm_offset_t)ptep, sizeof(*ptep)); rv = 1; } } return (rv); } static void pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap, pt_entry_t **ptep) { vm_offset_t va = *availp; struct l2_bucket *l2b; if (ptep) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (l2b == NULL) panic("pmap_alloc_specials: no l2b for 0x%x", va); *ptep = &l2b->l2b_kva[l2pte_index(va)]; } *vap = va; *availp = va + (PAGE_SIZE * pages); } /* * Bootstrap the system enough to run with virtual memory. * * On the arm this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ #define PMAP_STATIC_L2_SIZE 16 void pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt) { static struct l1_ttable static_l1; static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE]; struct l1_ttable *l1 = &static_l1; struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t pde; pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va; pt_entry_t *ptep; vm_paddr_t pa; vm_offset_t va; int l1idx, l2idx, l2next = 0; PDEBUG(1, printf("firstaddr = %08x, loadaddr = %08x\n", firstaddr, loadaddr)); virtual_avail = firstaddr; kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_l1 = l1; /* * Scan the L1 translation table created by initarm() and create * the required metadata for all valid mappings found in it. */ for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) { pde = kernel_l1pt[l1idx]; /* * We're only interested in Coarse mappings. 
* pmap_extract() can deal with section mappings without * recourse to checking L2 metadata. */ if ((pde & L1_TYPE_MASK) != L1_TYPE_C) continue; /* * Lookup the KVA of this L2 descriptor table */ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) { panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx", (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa); } /* * Fetch the associated L2 metadata structure. * Allocate a new one if necessary. */ if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) { if (l2next == PMAP_STATIC_L2_SIZE) panic("pmap_bootstrap: out of static L2s"); kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 = &static_l2[l2next++]; } /* * One more L1 slot tracked... */ l2->l2_occupancy++; /* * Fill in the details of the L2 descriptor in the * appropriate bucket. */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; l2b->l2b_kva = ptep; l2b->l2b_phys = pa; l2b->l2b_l1idx = l1idx; /* * Establish an initial occupancy count for this descriptor */ for (l2idx = 0; l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); l2idx++) { if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) { l2b->l2b_occupancy++; } } /* * Make sure the descriptor itself has the correct cache mode. * If not, fix it, but whine about the problem. Port-meisters * should consider this a clue to fix up their initarm() * function. :) */ if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) { printf("pmap_bootstrap: WARNING! wrong cache mode for " "L2 pte @ %p\n", ptep); } } /* * Initialize protection array. */ arm_protection_init(); /* * Ensure the primary (kernel) L1 has the correct cache mode for * a page table. Bitch if it is not correctly set. */ for (va = (vm_offset_t)kernel_l1pt; va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) { if (pmap_set_pt_cache_mode(kernel_l1pt, va)) printf("pmap_bootstrap: WARNING! wrong cache mode for " "primary L1 @ 0x%x\n", va); } cpu_dcache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); kernel_pmap->pm_active = -1; kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL; TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte); pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte); TAILQ_INIT(&l1_lru_list); mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF); pmap_init_l1(l1, kernel_l1pt); cpu_dcache_wbinv_all(); virtual_avail = round_page(virtual_avail); virtual_end = lastaddr; kernel_vm_end = virtual_end; } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { pmap_free_l1(pmap); dprintf("pmap_release()\n"); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. 
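 *
 *	For example, the VM system is expected to write-protect every
 *	mapping of a page before laundering it with
 *	pmap_page_protect(m, VM_PROT_READ), and to strip the page of
 *	all mappings with pmap_page_protect(m, VM_PROT_NONE), which
 *	degenerates to pmap_remove_all() below.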
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	if ((prot & VM_PROT_WRITE) == 0) {
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_changebit(m, AP_KRWURW, FALSE);
		} else {
			pmap_remove_all(m);
		}
	}
}

#define PMAP_REMOVE_PAGES_CURPROC_ONLY
/*
 * Remove all pages from specified address space;
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 */
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct l2_bucket *l2b;
	pt_entry_t *pte, tpte;
	pv_entry_t pv, npv;
-	int s;
	vm_page_t m;

#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
	if (!curproc || (pmap != vmspace_pmap(curproc->p_vmspace))) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
#endif
-	s = splvm();
+	vm_page_lock_queues();
	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
		if (pv->pv_va >= eva || pv->pv_va < sva) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}

		/*
		 * We cannot remove wired pages from a process' mapping
		 * at this time
		 */
		if (pv->pv_flags & PT_W) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}
		l2b = pmap_get_l2_bucket(pmap_kernel(), pv->pv_va);
		pte = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
		tpte = *pte;
		*pte = 0;
		m = PHYS_TO_VM_PAGE(tpte);

		KASSERT(m < &vm_page_array[vm_page_array_size],
		    ("pmap_remove_pages: bad tpte %x", tpte));

		pv->pv_pmap->pm_stats.resident_count--;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		vm_page_dirty(m);

		npv = TAILQ_NEXT(pv, pv_plist);
		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);

		m->md.pv_list_count--;
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
			vm_page_flag_clear(m, PG_REFERENCED);
		}

		pmap_free_l2_bucket(pv->pv_pmap, l2b, 1);
		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
		pmap_free_pv_entry(pv);
	}
-	splx(s);
	pmap_invalidate_tlb_all(pmap);
+	vm_page_unlock_queues();
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * add a wired page to the kva
 * note that in order for the mapping to take effect -- you
 * should do an invltlb after doing the pmap_kenter...
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_offset_t pa)
{
	struct l2_bucket *l2b;
	pt_entry_t *pte;
	pt_entry_t opte;

	PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n",
	    (uint32_t) va, (uint32_t) pa));

	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
	KASSERT(l2b != NULL, ("No L2 Bucket"));
	pte = &l2b->l2b_kva[l2pte_index(va)];
	opte = *pte;
	PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n",
	    (uint32_t) pte, opte, *pte));
	if (l2pte_valid(opte)) {
		cpu_dcache_wbinv_range(va, PAGE_SIZE);
		cpu_tlb_flushD_SE(va);
		cpu_cpwait();
	} else if (opte == 0)
		l2b->l2b_occupancy++;
	*pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL,
	    VM_PROT_READ | VM_PROT_WRITE) | pte_l2_s_cache_mode;
	PTE_SYNC(pte);
}

/*
 * remove a page from the kernel pagetables
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)vtopte(va);
	*pte = 0;
	pmap_invalidate_page(kernel_pmap, va);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.
 * Other architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
{
	vm_offset_t sva = *virt;
	vm_offset_t va = sva;

	PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, "
	    "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end,
	    prot));

	while (start < end) {
		pmap_kenter(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virt = va;
	return (sva);
}

/*
 * Add a list of wired pages to the kva;
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		pmap_kenter(va, VM_PAGE_TO_PHYS(m[i]));
		va += PAGE_SIZE;
	}
}

/*
 * this routine jerks page mappings from the
 * kernel -- it is meant only for temporary mappings.
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
}

/*
 * pmap_object_init_pt preloads the ptes for a given object
 * into the specified pmap.  This eliminates the blast of soft
 * faults on process startup and immediately after an mmap.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{
	printf("pmap_object_init_pt()\n");
}

/*
 *	pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	return (FALSE);
}

static PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	/* TODO: Invalidate I+D_SE */
	__asm("mcr p15, 0, r0, c7, c7, 0");
}

#if 0
static PMAP_INLINE void
pmap_invalidate_tlb(pmap_t pmap, vm_offset_t va)
{
	__asm("mcr p15, 0, r0, c8, c7, 0");
	/* TODO: Invalidate TLB */
}
#endif

static PMAP_INLINE void
pmap_invalidate_tlb_all(pmap_t pmap)
{
	__asm("mcr p15, 0, r0, c8, c7, 0");
	/* TODO: Invalidate all TLB */
}

static PMAP_INLINE void
pmap_changebit(vm_page_t m, int bit, boolean_t setem)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	int s;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	s = splvm();

	/*
	 * Loop over all current mappings setting/clearing as appropriate
	 */
	for (pv = TAILQ_FIRST(&m->md.pv_list); pv;
	    pv = TAILQ_NEXT(pv, pv_list)) {

		/*
		 * don't write protect pager mappings
		 */
		if (!setem && bit == AP_KRWURW) {
			if (!pmap_track_modified(pv->pv_va))
				continue;
		}

#if defined(PMAP_DEBUG)
		if (!pv->pv_pmap) {
			printf("Null pmap (cb) at va: 0x%x\n",
			    (uint32_t) pv->pv_va);
			continue;
		}
#endif

		pte = pmap_pte(pv->pv_pmap, pv->pv_va);

		if (setem) {
			*pte |= bit;
			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
		} else {
			pt_entry_t pbits = *pte;

			if (pbits & bit) {
				*pte = pbits & ~bit;
				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
			}
		}
	}
	splx(s);
}

/*
 * Fetch pointers to the PDE/PTE for the given pmap/VA pair.
 * Returns TRUE if the mapping exists, else FALSE.
 *
 * NOTE: This function is only used by a couple of arm-specific modules.
 * It is not safe to take any pmap locks here, since we could be right
 * in the middle of debugging the pmap anyway...
 *
 * It is possible for this routine to return FALSE even though a valid
 * mapping does exist.  This is because we don't lock, so the metadata
 * state may be inconsistent.
 *
 * NOTE: We can return a NULL *ptp in the case where the L1 pde is
 * a "section" mapping.
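 *
 * Typical use (a sketch only; error handling omitted):
 *
 *	pd_entry_t *pdep;
 *	pt_entry_t *ptep;
 *
 *	if (pmap_get_pde_pte(pm, va, &pdep, &ptep)) {
 *		if (ptep == NULL)
 *			pa = (*pdep & L1_S_FRAME) | (va & L1_S_OFFSET);
 *		else
 *			pa = l2pte_pa(*ptep);
 *	}
 *
 * where the first branch handles a section mapping and the second a
 * page mapping (compare pmap_extract() below).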
*/ boolean_t pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; if (pm->pm_l1 == NULL) return (FALSE); l1idx = L1_IDX(va); *pdp = pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = *pl1pd; if (l1pte_section_p(l1pd)) { *ptp = NULL; return (TRUE); } if (pm->pm_l2 == NULL) return (FALSE); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { return (FALSE); } *ptp = &ptep[l2pte_index(va)]; return (TRUE); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DEBUG) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte(pv->pv_pmap, pv->pv_va); tpte = atomic_readandclear_int(pte); if (pv->pv_flags & PT_W) pv->pv_pmap->pm_stats.wired_count--; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); pmap_idcache_wbinv_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE); pmap_free_pv_entry(pv); } splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t next_bucket; u_int flags; int flush; if ((prot & VM_PROT_READ) == 0) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) { /* * If this is a read->write transition, just ignore it and let * uvm_fault() take care of it later. */ return; } /* * OK, at this point, we know we're doing write-protect operation. * If the pmap is active, write-back the range. */ pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE); flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1; flags = 0; while (sva < eva) { next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; while (sva < next_bucket) { if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) { struct vm_page *pg; u_int f; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); pte &= ~L2_S_PROT_W; *ptep = pte; PTE_SYNC(ptep); if (pg != NULL) { f = pmap_modify_pv(pg, pm, sva, PVF_WRITE, 0); pmap_vac_me_harder(pg, pm, sva); } else f = PVF_REF | PVF_EXEC; if (flush >= 0) { flush++; flags |= f; } else if (PV_BEEN_EXECD(f)) pmap_tlb_flushID_SE(pm, sva); else if (PV_BEEN_REFD(f)) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; } } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(pm); else if (PV_BEEN_REFD(flags)) pmap_tlb_flushD(pm); } } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. 
* * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ vm_offset_t getttb(void); void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { struct l2_bucket *l2b; struct vm_page *opg; struct pv_entry *pve; pt_entry_t *ptep, npte, opte; u_int nflags; u_int oflags; vm_paddr_t pa; if (va == vector_page) { pa = systempage.pv_pa; m = NULL; } else pa = VM_PAGE_TO_PHYS(m); nflags = 0; if (prot & VM_PROT_WRITE) nflags |= PVF_WRITE; if (prot & VM_PROT_EXECUTE) nflags |= PVF_EXEC; if (wired) nflags |= PVF_WIRED; PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, " "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired)); if (pmap == pmap_kernel()) l2b = pmap_get_l2_bucket(pmap, va); else { l2b = pmap_alloc_l2_bucket(pmap, va); } if (l2b == NULL) panic("pmap_enter: failed to allocate l2 bucket"); ptep = &l2b->l2b_kva[l2pte_index(va)]; /* * Page table entry not valid, we need a new PT page */ if (ptep == NULL) panic("pmap_enter: invalid page table pte=%p, va=0x%x 0x%x\n", ptep, va, UPT_MIN_ADDRESS); opte = *(vm_offset_t *)ptep; npte = pa; oflags = 0; if (opte) { /* * There is already a mapping at this address. * If the physical address is different, lookup the * vm_page. */ if (l2pte_pa(opte) != pa) opg = PHYS_TO_VM_PAGE(l2pte_pa(opte)); else opg = m; } else opg = NULL; if (m && !(m->flags & PG_UNMANAGED)) { /* * This is to be a managed mapping. */ if ((prot & (VM_PROT_ALL)) || (m->md.pvh_attrs & PVF_REF)) { /* * - The access type indicates that we don't need * to do referenced emulation. * OR * - The physical page has already been referenced * so no need to re-do referenced emulation here. */ npte |= L2_S_PROTO; nflags |= PVF_REF; if (((prot & VM_PROT_WRITE) != 0 && ((m->flags & PG_WRITEABLE) || (m->md.pvh_attrs & PVF_MOD) != 0))) { /* * This is a writable mapping, and the * page's mod state indicates it has * already been modified. Make it * writable from the outset. */ npte |= L2_S_PROT_W; nflags |= PVF_MOD; } } else { /* * Need to do page referenced emulation. */ npte |= L2_TYPE_INV; } npte |= pte_l2_s_cache_mode; if (m == opg) { /* * We're changing the attrs of an existing mapping. */ #if 0 simple_lock(&pg->mdpage.pvh_slock); #endif oflags = pmap_modify_pv(m, pmap, va, PVF_WRITE | PVF_EXEC | PVF_WIRED | PVF_MOD | PVF_REF, nflags); #if 0 simple_unlock(&pg->mdpage.pvh_slock); #endif /* * We may need to flush the cache if we're * doing rw-ro... */ if (pmap->pm_cstate.cs_cache_d && (oflags & PVF_NC) == 0 && (opte & L2_S_PROT_W) != 0 && (prot & VM_PROT_WRITE) == 0) cpu_dcache_wb_range(va, PAGE_SIZE); } else { /* * New mapping, or changing the backing page * of an existing mapping. */ if (opg) { /* * Replacing an existing mapping with a new one. * It is part of our managed memory so we * must remove it from the PV list */ #if 0 simple_lock(&opg->mdpage.pvh_slock); #endif pve = pmap_remove_pv(opg, pmap, va); pmap_vac_me_harder(opg, pmap, 0); #if 0 simple_unlock(&opg->mdpage.pvh_slock); #endif oflags = pve->pv_flags; /* * If the old mapping was valid (ref/mod * emulation creates 'invalid' mappings * initially) then make sure to frob * the cache. 
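				 * ("Frob" here means: write back and/or
				 * invalidate the old mapping's cache lines
				 * before the PTE changes -- I+D cache if
				 * the old mapping was executable, D-cache
				 * only if it was merely referenced, as
				 * done just below.)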
*/ if ((oflags & PVF_NC) == 0 && l2pte_valid(opte)) { if (PV_BEEN_EXECD(oflags)) { pmap_idcache_wbinv_range(pmap, va, PAGE_SIZE); } else if (PV_BEEN_REFD(oflags)) { pmap_dcache_wb_range(pmap, va, PAGE_SIZE, TRUE, (oflags & PVF_WRITE) == 0); } } } else if ((pve = pmap_get_pv_entry()) == NULL) { panic("pmap_enter: no pv entries"); } pmap_enter_pv(m, pve, pmap, va, nflags); } } else { /* * We're mapping an unmanaged page. * These are always readable, and possibly writable, from * the get go as we don't need to track ref/mod status. */ npte |= L2_S_PROTO; if (prot & VM_PROT_WRITE) { npte |= L2_S_PROT_W; } /* * Make sure the vector table is mapped cacheable */ if (pmap != pmap_kernel() && va == vector_page) npte |= pte_l2_s_cache_mode; if (opg) { /* * Looks like there's an existing 'managed' mapping * at this address. */ #if 0 simple_lock(&opg->mdpage.pvh_slock); #endif pve = pmap_remove_pv(opg, pmap, va); pmap_vac_me_harder(opg, pmap, 0); #if 0 simple_unlock(&opg->mdpage.pvh_slock); #endif oflags = pve->pv_flags; if ((oflags & PVF_NC) == 0 && l2pte_valid(opte)) { if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pmap, va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pmap, va, PAGE_SIZE, TRUE, (oflags & PVF_WRITE) == 0); } } } /* * Make sure userland mappings get the right permissions */ if (pmap != pmap_kernel() && va != vector_page) { npte |= L2_S_PROT_U; } /* * Keep the stats up to date */ if (opte == 0) { l2b->l2b_occupancy++; pmap->pm_stats.resident_count++; } /* * If this is just a wiring change, the two PTEs will be * identical, so there's no need to update the page table. */ if (npte != opte) { boolean_t is_cached = pmap_is_cached(pmap); *ptep = npte; if (is_cached) { /* * We only need to frob the cache/tlb if this pmap * is current */ PTE_SYNC(ptep); if (L1_IDX(va) != L1_IDX(vector_page) && l2pte_valid(npte)) { /* * This mapping is likely to be accessed as * soon as we return to userland. Fix up the * L1 entry to avoid taking another * page/domain fault. */ pd_entry_t *pl1pd, l1pd; pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)]; l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); } } } if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pmap, va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pmap, va); if (m && !(m->flags & PG_UNMANAGED)) { #if 0 simple_lock(&pg->mdpage.pvh_slock); #endif pmap_vac_me_harder(m, pmap, va); #if 0 simple_unlock(&pg->mdpage.pvh_slock); #endif } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_EXECUTE, FALSE); return (NULL); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. 
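 *
 *	For example, unwiring one page of a previously wired range
 *	would look like (illustrative call):
 *
 *		pmap_change_wiring(pmap, va, FALSE);
 *
 *	Only the wired accounting changes; the translation itself is
 *	left intact.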
*/ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_page_t pg; l2b = pmap_get_l2_bucket(pmap, va); KASSERT(l2b, ("No l2b bucket in pmap_change_wiring")); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); if (pg) pmap_modify_pv(pg, pmap, va, PVF_WIRED, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < clean_sva) || (va >= clean_eva)) return 1; else return 0; } /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. */ static pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { struct l2_bucket *l2b; l2b = pmap_get_l2_bucket(pmap, va); if (l2b == NULL) return (NULL); return (&l2b->l2b_kva[l2pte_index(va)]); } vm_paddr_t pmap_kextract(vm_offset_t va) { return (pmap_extract(pmap_kernel(), va)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; l1idx = L1_IDX(va); pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = *pl1pd; if (l1pte_section_p(l1pd)) { /* * These should only happen for pmap_kernel() */ KASSERT(pm == pmap_kernel(), ("huh")); pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { return (0); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) return (0); switch (pte & L2_TYPE_MASK) { case L2_TYPE_L: pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); break; default: pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); break; } } return (pa); } vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa; vm_page_t m; m = NULL; mtx_lock(&Giant); if ((pa = pmap_extract(pmap, va)) != 0) { m = PHYS_TO_VM_PAGE(pa); vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); } mtx_unlock(&Giant); return (m); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { if (va >= UPT_MIN_ADDRESS) return 0; return pmap_unwire_pte_hold(pmap, mpte); } void pmap_update(pmap_t pm) { if (pmap_is_current(pm)) { /* * If we're dealing with a current userland pmap, move its L1 * to the end of the LRU. */ if (pm != pmap_kernel()) pmap_use_l1(pm); /* * We can assume we're done with frobbing the cache/tlb for * now. Make sure any future pmap ops don't skip cache/tlb * flushes. */ pm->pm_cstate.cs_all = PMAP_CACHE_STATE_ALL; } /* * make sure TLB/cache operations have completed. */ cpu_cpwait(); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
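 *
 * On arm, if the system vector page lives in user space (vector_page <
 * KERNBASE), every user pmap must also receive a read-only mapping of
 * it; see the pmap_enter() call below.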
*/ void pmap_pinit(pmap_t pmap) { PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap)); pmap_alloc_l1(pmap); bzero(pmap->pm_l2, sizeof(pmap->pm_l2)); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_stats.resident_count = 1; if (vector_page < KERNBASE) { pmap_enter(pmap, vector_page, PHYS_TO_VM_PAGE(systempage.pv_pa), VM_PROT_READ, 1); pmap_update(pmap); } } /*************************************************** * page management routines. ***************************************************/ static PMAP_INLINE void pmap_free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t pmap_get_pv_entry(void) { pv_entry_t ret_value; pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } ret_value = uma_zalloc(pvzone, M_NOWAIT); return ret_value; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ #define PMAP_REMOVE_CLEAN_LIST_SIZE 3 void pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct l2_bucket *l2b; vm_offset_t next_bucket; pt_entry_t *ptep; u_int cleanlist_idx, total, cnt; struct { vm_offset_t va; pt_entry_t *pte; } cleanlist[PMAP_REMOVE_CLEAN_LIST_SIZE]; u_int mappings, is_exec, is_refd; /* * we lock in the pmap => pv_head direction */ #if 0 PMAP_MAP_TO_HEAD_LOCK(); pmap_acquire_pmap_lock(pm); #endif if (!pmap_is_cached(pm)) { cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1; } else cleanlist_idx = 0; total = 0; while (sva < eva) { /* * Do one L2 bucket's worth at a time. */ next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; mappings = 0; while (sva < next_bucket) { struct vm_page *pg; pt_entry_t pte; vm_paddr_t pa; pte = *ptep; if (pte == 0) { /* * Nothing here, move along */ sva += PAGE_SIZE; ptep++; continue; } pm->pm_stats.resident_count--; pa = l2pte_pa(pte); is_exec = 0; is_refd = 1; /* * Update flags. In a number of circumstances, * we could cluster a lot of these and do a * number of sequential pages in one go. */ if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) { struct pv_entry *pve; #if 0 simple_lock(&pg->mdpage.pvh_slock); #endif pve = pmap_remove_pv(pg, pm, sva); pmap_vac_me_harder(pg, pm, 0); #if 0 simple_unlock(&pg->mdpage.pvh_slock); #endif if (pve != NULL) { is_exec = PV_BEEN_EXECD(pve->pv_flags); is_refd = PV_BEEN_REFD(pve->pv_flags); pmap_free_pv_entry(pve); } } if (!l2pte_valid(pte)) { *ptep = 0; PTE_SYNC_CURRENT(pm, ptep); sva += PAGE_SIZE; ptep++; mappings++; continue; } if (cleanlist_idx < PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Add to the clean list. */ cleanlist[cleanlist_idx].pte = ptep; cleanlist[cleanlist_idx].va = sva | (is_exec & 1); cleanlist_idx++; } else if (cleanlist_idx == PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Nuke everything if needed. */ pmap_idcache_wbinv_all(pm); pmap_tlb_flushID(pm); /* * Roll back the previous PTE list, * and zero out the current PTE. 
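				 * (The wbinv/flush above has already
				 * cleaned the whole cache and TLB, so the
				 * per-page cache operations the cleanlist
				 * was deferring are no longer needed;
				 * zeroing the PTEs is all that remains.)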
 */
				for (cnt = 0;
				    cnt < PMAP_REMOVE_CLEAN_LIST_SIZE; cnt++) {
					*cleanlist[cnt].pte = 0;
				}
				*ptep = 0;
				PTE_SYNC(ptep);
				cleanlist_idx++;
			} else {
				*ptep = 0;
				PTE_SYNC(ptep);
				if (is_exec)
					pmap_tlb_flushID_SE(pm, sva);
				else if (is_refd)
					pmap_tlb_flushD_SE(pm, sva);
			}

			sva += PAGE_SIZE;
			ptep++;
			mappings++;
		}

		/*
		 * Deal with any leftovers
		 */
		if (cleanlist_idx <= PMAP_REMOVE_CLEAN_LIST_SIZE) {
			total += cleanlist_idx;
			for (cnt = 0; cnt < cleanlist_idx; cnt++) {
				if (pm->pm_cstate.cs_all != 0) {
					vm_offset_t clva =
					    cleanlist[cnt].va & ~1;
					if (cleanlist[cnt].va & 1) {
						pmap_idcache_wbinv_range(pm,
						    clva, PAGE_SIZE);
						pmap_tlb_flushID_SE(pm, clva);
					} else {
						pmap_dcache_wb_range(pm,
						    clva, PAGE_SIZE, TRUE,
						    FALSE);
						pmap_tlb_flushD_SE(pm, clva);
					}
				}
				*cleanlist[cnt].pte = 0;
				PTE_SYNC_CURRENT(pm, cleanlist[cnt].pte);
			}

			/*
			 * If it looks like we're removing a whole bunch
			 * of mappings, it's faster to just write-back
			 * the whole cache now and defer TLB flushes until
			 * pmap_update() is called.
			 */
			if (total <= PMAP_REMOVE_CLEAN_LIST_SIZE)
				cleanlist_idx = 0;
			else {
				cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1;
				pmap_idcache_wbinv_all(pm);
			}
		}

		pmap_free_l2_bucket(pm, l2b, mappings);
	}

#if 0
	pmap_release_pmap_lock(pm);
	PMAP_MAP_TO_HEAD_UNLOCK();
#endif
}

static PMAP_INLINE int
pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	return 0;
}

/*
 * pmap_zero_page()
 *
 * Zero a given physical page by mapping it at a page hook point.
 * In doing the zero page op, the page we zero is mapped cacheable, since
 * on StrongARM accesses to non-cached pages are non-burst, making writing
 * _any_ bulk data very slow.
 */
#if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0
void
pmap_zero_page_generic(vm_paddr_t phys, int off, int size)
{
#ifdef DEBUG
	struct vm_page *pg = PHYS_TO_VM_PAGE(phys);

	if (pg->md.pvh_list != NULL)
		panic("pmap_zero_page: page has mappings");
#endif

	/*
	 * Hook in the page, zero it, and purge the cache for that
	 * zeroed page.  Invalidate the TLB as needed.
	 */
	*cdst_pte = L2_S_PROTO | phys |
	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode;
	PTE_SYNC(cdst_pte);
	cpu_tlb_flushD_SE(cdstp);
	cpu_cpwait();
	if (off || size)
		bzero((void *)(cdstp + off), size);
	else
		bzero_page(cdstp);
	cpu_dcache_wbinv_range(cdstp, PAGE_SIZE);
}
#endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */

#if ARM_MMU_XSCALE == 1
void
pmap_zero_page_xscale(vm_paddr_t phys, int off, int size)
{
#ifdef DEBUG
	struct vm_page *pg = PHYS_TO_VM_PAGE(phys);

	if (pg->md.pvh_list != NULL)
		panic("pmap_zero_page: page has mappings");
#endif

	/*
	 * Hook in the page, zero it, and purge the cache for that
	 * zeroed page.  Invalidate the TLB as needed.
	 */
	*cdst_pte = L2_S_PROTO | phys |
	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) |
	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
	PTE_SYNC(cdst_pte);
	cpu_tlb_flushD_SE(cdstp);
	cpu_cpwait();
	if (off || size)
		bzero((void *)(cdstp + off), size);
	else
		bzero_page(cdstp);
	xscale_cache_clean_minidata();
}

/*
 * Change the PTEs for the specified kernel mappings such that they
 * will use the mini data cache instead of the main data cache.
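 *
 * For example (sketch; "kstack_va" is a hypothetical kernel stack
 * address):
 *
 *	pmap_uarea(kstack_va);
 *
 * clears L2_B on each PTE in the USPACE-sized range so those lines are
 * cached in the mini-data cache, sparing main D-cache cleans on
 * context switch.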
*/ void pmap_uarea(vm_offset_t va) { struct l2_bucket *l2b; pt_entry_t *ptep, *sptep, pte; vm_offset_t next_bucket, eva; #if (ARM_NMMUS > 1) if (xscale_use_minidata == 0) return; #endif eva = va + USPACE; while (va < eva) { next_bucket = L2_NEXT_BUCKET(va); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pmap_kernel(), va); sptep = ptep = &l2b->l2b_kva[l2pte_index(va)]; while (va < next_bucket) { pte = *ptep; if (!l2pte_minidata(pte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); *ptep = pte & ~L2_B; } ptep++; va += PAGE_SIZE; } PTE_SYNC_RANGE(sptep, (u_int)(ptep - sptep)); } cpu_cpwait(); } #endif /* ARM_MMU_XSCALE == 1 */ /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), 0, 0); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { unsigned int i; int *ptr; vm_paddr_t phys = VM_PAGE_TO_PHYS(m); pmap_zero_page(m); return; *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); for (i = 0, ptr = (int *)cdstp; i < (PAGE_SIZE / sizeof(int)); i++) { *ptr = 0; } cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } /* * pmap_clean_page() * * This is a local function used to work out the best strategy to clean * a single page referenced by its entry in the PV table. It's used by * pmap_copy_page, pmap_zero page and maybe some others later on. * * Its policy is effectively: * o If there are no mappings, we don't bother doing anything with the cache. * o If there is one mapping, we clean just that page. * o If there are multiple mappings, we clean the entire cache. * * So that some functions can be further optimised, it returns 0 if it didn't * clean the entire cache, or 1 if it did. * * XXX One bug in this routine is that if the pv_entry has a single page * mapped at 0x00000000 a whole cache clean will be performed rather than * just the 1 page. Since this should not occur in everyday use and if it does * it will just result in not the most efficient clean for the page. */ static int pmap_clean_page(struct pv_entry *pv, boolean_t is_src) { pmap_t pm, pm_to_clean = NULL; struct pv_entry *npv; u_int cache_needs_cleaning = 0; u_int flags = 0; vm_offset_t page_to_clean = 0; if (pv == NULL) { /* nothing mapped in so nothing to flush */ return (0); } /* * Since we flush the cache each time we change to a different * user vmspace, we only need to flush the page if it is in the * current pmap. */ if (curproc) pm = curproc->p_vmspace->vm_map.pmap; else pm = pmap_kernel(); for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) { if (npv->pv_pmap == pmap_kernel() || npv->pv_pmap == pm) { flags |= npv->pv_flags; /* * The page is mapped non-cacheable in * this map. No need to flush the cache. 
*/ if (npv->pv_flags & PVF_NC) { #ifdef DIAGNOSTIC if (cache_needs_cleaning) panic("pmap_clean_page: " "cache inconsistency"); #endif break; } else if (is_src && (npv->pv_flags & PVF_WRITE) == 0) continue; if (cache_needs_cleaning) { page_to_clean = 0; break; } else { page_to_clean = npv->pv_va; pm_to_clean = npv->pv_pmap; } cache_needs_cleaning = 1; } } if (page_to_clean) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_range(pm_to_clean, page_to_clean, PAGE_SIZE); else pmap_dcache_wb_range(pm_to_clean, page_to_clean, PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0); } else if (cache_needs_cleaning) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_all(pm); else pmap_dcache_wbinv_all(pm); return (1); } return (0); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ /* * pmap_copy_page() * * Copy one physical page into another, by mapping the pages into * hook points. The same comment regarding cachability as in * pmap_zero_page also applies here. */ #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 void pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) { struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 mtx_lock(&src_pg->md.pvh_mtx); #endif (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. */ *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); cpu_dcache_inv_range(csrcp, PAGE_SIZE); #if 0 mtx_lock(&src_pg->md.pvh_mtx); #endif cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 void pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst) { struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. 
*/ *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); xscale_cache_clean_minidata(); } #endif /* ARM_MMU_XSCALE == 1 */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pmap_copy_page_func(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (FALSE); s = splvm(); /* * Not found, check current mappings returning immediately */ for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { splx(s); return (TRUE); } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { printf("pmap_ts_referenced()\n"); return (0); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { printf("pmap_is_modified()\n"); return(FALSE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { if (m->md.pvh_attrs & PVF_MOD) pmap_clearbit(m, PVF_MOD); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { if (m->md.pvh_attrs & PVF_REF) pmap_clearbit(m, PVF_REF); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { printf("pmap_mincore()\n"); return (0); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return(addr); } static void arm_protection_init(void) { int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: *kp++ = 0; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PT_AP(AP_KRW); } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { vm_offset_t va, tmpva, offset; pt_entry_t *pte; /* XXX: pmap_mapdev is wrong. */ offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0;) { pte = vtopte((vm_offset_t)vtopte(tmpva)); *pte = L2_PTE(pa, AP_KRW); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_tlb_all(kernel_pmap); return ((void *)(va + offset)); } #define BOOTSTRAP_DEBUG /* * pmap_map_section: * * Create a single section mapping. 
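 *
 *	For example (illustrative values only), mapping one 1MB section
 *	of RAM during bootstrap might look like:
 *
 *		pmap_map_section(l1pt_va, va, pa,
 *		    VM_PROT_READ | VM_PROT_WRITE, PTE_CACHE);
 *
 *	where va and pa must both be 1MB-aligned (the L1_S_OFFSET
 *	KASSERT below enforces this).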
 */
void
pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
    int prot, int cache)
{
	pd_entry_t *pde = (pd_entry_t *) l1pt;
	pd_entry_t fl;

	KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2"));

	switch (cache) {
	case PTE_NOCACHE:
	default:
		fl = 0;
		break;

	case PTE_CACHE:
		fl = pte_l1_s_cache_mode;
		break;

	case PTE_PAGETABLE:
		fl = pte_l1_s_cache_mode_pt;
		break;
	}

	pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
	    L1_S_PROT(PTE_KERNEL, prot) | fl | L1_S_DOM(PMAP_DOMAIN_KERNEL);
	PTE_SYNC(&pde[va >> L1_S_SHIFT]);
}

/*
 * pmap_link_l2pt:
 *
 *	Link the L2 page table specified by "pa" into the L1
 *	page table at the slot for "va".
 */
void
pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv)
{
	pd_entry_t *pde = (pd_entry_t *) l1pt, proto;
	u_int slot = va >> L1_S_SHIFT;

#ifndef ARM32_NEW_VM_LAYOUT
	KASSERT((va & ((L1_S_SIZE * 4) - 1)) == 0, ("blah"));
	KASSERT((l2pv->pv_pa & PAGE_MASK) == 0, ("ouin"));
#endif

	proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO;

	pde[slot + 0] = proto | (l2pv->pv_pa + 0x000);
#ifdef ARM32_NEW_VM_LAYOUT
	PTE_SYNC(&pde[slot]);
#else
	pde[slot + 1] = proto | (l2pv->pv_pa + 0x400);
	pde[slot + 2] = proto | (l2pv->pv_pa + 0x800);
	pde[slot + 3] = proto | (l2pv->pv_pa + 0xc00);
	PTE_SYNC_RANGE(&pde[slot + 0], 4);
#endif

	SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list);
}

/*
 * pmap_map_entry
 *
 *	Create a single page mapping.
 */
void
pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
    int cache)
{
	pd_entry_t *pde = (pd_entry_t *) l1pt;
	pt_entry_t fl;
	pt_entry_t *pte;

	KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin"));

	switch (cache) {
	case PTE_NOCACHE:
	default:
		fl = 0;
		break;

	case PTE_CACHE:
		fl = pte_l2_s_cache_mode;
		break;

	case PTE_PAGETABLE:
		fl = pte_l2_s_cache_mode_pt;
		break;
	}

	if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
		panic("pmap_map_entry: no L2 table for VA 0x%08x", va);

#ifndef ARM32_NEW_VM_LAYOUT
	pte = (pt_entry_t *)
	    kernel_pt_lookup(pde[va >> L1_S_SHIFT] & L2_S_FRAME);
#else
	pte = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK);
#endif
	if (pte == NULL)
		panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va);

#ifndef ARM32_NEW_VM_LAYOUT
	pte[(va >> PAGE_SHIFT) & 0x3ff] =
	    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | fl;
	PTE_SYNC(&pte[(va >> PAGE_SHIFT) & 0x3ff]);
#else
	pte[l2pte_index(va)] =
	    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | fl;
	PTE_SYNC(&pte[l2pte_index(va)]);
#endif
}

/*
 * pmap_map_chunk:
 *
 *	Map a chunk of memory using the most efficient mappings
 *	possible (section, large page, small page) into the
 *	provided L1 and L2 tables at the specified virtual address.
 */
vm_size_t
pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
    vm_size_t size, int prot, int cache)
{
	pd_entry_t *pde = (pd_entry_t *) l1pt;
	pt_entry_t *pte, f1, f2s, f2l;
	vm_size_t resid;
	int i;

	resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

	if (l1pt == 0)
		panic("pmap_map_chunk: no L1 table provided");

#ifdef VERBOSE_INIT_ARM
	printf("pmap_map_chunk: pa=0x%lx va=0x%lx size=0x%lx resid=0x%lx "
	    "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache);
#endif

	switch (cache) {
	case PTE_NOCACHE:
	default:
		f1 = 0;
		f2l = 0;
		f2s = 0;
		break;

	case PTE_CACHE:
		f1 = pte_l1_s_cache_mode;
		f2l = pte_l2_l_cache_mode;
		f2s = pte_l2_s_cache_mode;
		break;

	case PTE_PAGETABLE:
		f1 = pte_l1_s_cache_mode_pt;
		f2l = pte_l2_l_cache_mode_pt;
		f2s = pte_l2_s_cache_mode_pt;
		break;
	}

	size = resid;

	while (resid > 0) {
		/*
		 * See if we can use a section mapping.
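		 * (A section mapping is possible only when va and pa are
		 * both 1MB (L1_S_SIZE) aligned and at least 1MB remains;
		 * L1_S_MAPPABLE_P() below encapsulates that test.)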
*/ if (L1_S_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("S"); #endif pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | f1 | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); va += L1_S_SIZE; pa += L1_S_SIZE; resid -= L1_S_SIZE; continue; } /* * Ok, we're going to use an L2 table. Make sure * one is actually in the corresponding L1 slot * for the current VA. */ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_chunk: no L2 table for VA 0x%08x", va); #ifndef ARM32_NEW_VM_LAYOUT pte = (pt_entry_t *) kernel_pt_lookup(pde[va >> L1_S_SHIFT] & L2_S_FRAME); #else pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); #endif if (pte == NULL) panic("pmap_map_chunk: can't find L2 table for VA" "0x%08x", va); /* See if we can use a L2 large page mapping. */ if (L2_L_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("L"); #endif for (i = 0; i < 16; i++) { #ifndef ARM32_NEW_VM_LAYOUT pte[((va >> PAGE_SHIFT) & 0x3f0) + i] = L2_L_PROTO | pa | L2_L_PROT(PTE_KERNEL, prot) | f2l; PTE_SYNC(&pte[((va >> PAGE_SHIFT) & 0x3f0) + i]); #else pte[l2pte_index(va) + i] = L2_L_PROTO | pa | L2_L_PROT(PTE_KERNEL, prot) | f2l; PTE_SYNC(&pte[l2pte_index(va) + i]); #endif } va += L2_L_SIZE; pa += L2_L_SIZE; resid -= L2_L_SIZE; continue; } /* Use a small page mapping. */ #ifdef VERBOSE_INIT_ARM printf("P"); #endif #ifndef ARM32_NEW_VM_LAYOUT pte[(va >> PAGE_SHIFT) & 0x3ff] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s; PTE_SYNC(&pte[(va >> PAGE_SHIFT) & 0x3ff]); #else pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s; PTE_SYNC(&pte[l2pte_index(va)]); #endif va += PAGE_SIZE; pa += PAGE_SIZE; resid -= PAGE_SIZE; } #ifdef VERBOSE_INIT_ARM printf("\n"); #endif return (size); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 132081) +++ head/sys/i386/i386/pmap.c (revision 132082) @@ -1,3097 +1,3098 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #if defined(CPU_DISABLE_SSE) #undef CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; #ifdef SMP static struct mtx lazypmap_lock; #endif vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE static uma_zone_t pdptzone; #endif /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *CMAP3; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2, CADDR3; static struct mtx CMAPCADDR12_lock; struct msgbuf *msgbufp = 0; /* * Crashdump maps. 
*/ static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0, *PMAP2; static pt_entry_t *PADDR1 = 0, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void pmap_clear_ptes(vm_page_t m, int bit); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static int pmap_unuse_pt(pmap_t, vm_offset_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + PDRMASK) & ~PDRMASK; #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_paddr_t firstaddr; vm_paddr_t loadaddr; { vm_offset_t va; pt_entry_t *pte, *unused; int i; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). 
*/ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); #ifdef SMP mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); #endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) *CMAP3 = 0; mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); virtual_avail = va; *CMAP1 = *CMAP2 = 0; for (i = 0; i < NKPT; i++) PTD[i] = 0; /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ void pmap_set_pg(void) { pd_entry_t pdir; pt_entry_t *pte; vm_offset_t va, endva; int i; if (pgeflag == 0) return; i = KERNLOAD/NBPDR; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir = kernel_pmap->pm_pdir[KPTDI+i]; pdir |= pgeflag; kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; invltlb(); /* Play it safe, invltlb() every time */ i++; va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += PAGE_SIZE; } } } #ifdef PAE static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support, in a fairly consistent * way, discontiguous physical memory. */ void pmap_init(void) { int i; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for (i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries.
Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } #ifdef I386_CPU /* * i386 only has "invalidate everything" and no SMP to worry about. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #else /* !I386_CPU */ #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; if (smp_started) { if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; if (smp_started) { if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. 
* XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; if (smp_started) { if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_tlb_mtx); } else critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active * XXX critical sections disable interrupts again */ if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } if (smp_started) mtx_unlock_spin(&smp_tlb_mtx); else critical_exit(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #endif /* !SMP */ #endif /* !I386_CPU */ /* * Are we current address space or kernel? N.B. We return FALSE when * a pmap's page table is in use because a kernel thread is borrowing * it. The borrowed page table can change spontaneously, making any * dependence on its continued use subject to a race condition. */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); } /* * If the given pmap is not the current pmap, Giant must be held. */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); GIANT_REQUIRED; newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static __inline void invlcaddr(void *caddr) { #ifdef I386_CPU invltlb(); #else invlpg((u_int)caddr); #endif } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, vm_page_queue_mtx * must be held and curthread pinned to a CPU. 
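 *
 * A minimal calling sketch for the non-current-pmap case (the same
 * pattern pmap_remove_all() uses below; "pv" is a hypothetical pv
 * entry and "pte" a pt_entry_t pointer):
 *
 *	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 *	sched_pin();
 *	PMAP_LOCK(pv->pv_pmap);
 *	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 *	... read or update *pte ...
 *	PMAP_UNLOCK(pv->pv_pmap);
 *	sched_unpin();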
*/ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; if (pmap == NULL) return (rtval); PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; if (pmap == NULL) return (m); vm_page_lock_queues(); PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) | (va & PDRMASK)); vm_page_hold(m); } } else { sched_pin(); pte = *pmap_pte_quick(pmap, va); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } sched_unpin(); } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
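 *
 * A boot-time sketch for i386 (hypothetical physical range; the
 * caller owns 'virtual_avail'):
 *
 *	vm_offset_t va = virtual_avail;
 *	vm_offset_t sva = pmap_map(&va, 0x100000, 0x104000,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	virtual_avail = va;
 *
 * The four pages are then addressable at 'sva', and 'va' has been
 * advanced past the mapped region.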
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * We never unwire a kernel page table page, making a * check for the kernel_pmap unnecessary. */ if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
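 *
 * The expected use is on the pmap embedded in a newly created
 * vmspace, roughly (sketch; 'vm' is hypothetical):
 *
 *	struct vmspace *vm;	... preallocated and zeroed ...
 *	pmap_pinit(vmspace_pmap(vm));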
*/ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; static int color; int i; PMAP_LOCK_INIT(pmap); /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, NBPTD); #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif } /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { VM_WAIT; /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa); m->hold_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex); if (m == NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. 
***************************************************/ #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask = PCPU_GET(cpumask); u_int mask; register u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&lazypmap_lock); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } } mtx_unlock_spin(&lazypmap_lock); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); } } #endif /* SMP */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
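 *
 * A sketch of the expected teardown order, mappings first (address
 * bounds are illustrative):
 *
 *	pmap_remove_pages(pmap, 0, VM_MAXUSER_ADDRESS);
 *	pmap_release(pmap);	... resident_count must now be 0 ...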
*/ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); #ifdef SMP pmap->pm_pdir[MPPTDI] = 0; #endif pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); vm_page_lock_queues(); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_free_zero(m); } vm_page_unlock_queues(); PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
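 *
 * The high-water test below uses the limits computed in pmap_init2();
 * with hypothetical tunables shpgperproc = 200, maxproc = 1000 and
 * vm_page_array_size = 250000, pv_entry_max = 200 * 1000 + 250000 =
 * 450000 and pv_entry_high_water = 9 * (450000 / 10) = 405000, past
 * which this routine wakes the pagedaemon to reclaim pv entries.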
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; vm_page_lock_queues(); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; vm_page_unlock_queues(); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va); } } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); PMAP_UNLOCK(pmap); return; } anyvalid = 0; for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. 
Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { if ((pte = pmap_pte(pmap, sva)) == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va); PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. 
*/ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; if ((pte = pmap_pte(pmap, sva)) == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) { /* * Yes, I know this will truncate upper address bits for PAE, * but I'm actually more interested in the lower bits */ printf("pmap_enter: va %p, pte %p, origpte %p\n", (void *)va, (void *)pte, (void *)(uintptr_t)origpte); panic("pmap_enter: attempted pmap_enter on 4MB page"); } /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
*/ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); PMAP_LOCK(pmap); err = pmap_remove_pte(pmap, pte, va); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); mpte = PHYS_TO_VM_PAGE(ptepa); mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); if (mpte == NULL) goto retry; } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. 
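 *
 * A dump loop might map a run of physical pages back to back and
 * write them out in one pass (sketch; "pa" and "count" are
 * hypothetical):
 *
 *	void *base;
 *	for (i = 0; i < count; i++)
 *		base = pmap_kenter_temporary(pa + ptoa(i), i);
 *	... 'base' is always crashdumpmap; count * PAGE_SIZE bytes
 *	are now readable there ...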
*/ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); #ifndef I386_CPU invlpg(va); #else invltlb(); #endif return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (pseflag && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR2); pagezero(CADDR2); *CMAP2 = 0; sched_unpin(); mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(CADDR2); else bzero((char *)CADDR2 + off, size); *CMAP2 = 0; sched_unpin(); mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
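 *
 * Usage is a single call once both pages exist (sketch; "src" and
 * "dst" are hypothetical vm_page_t's, e.g. for a copy-on-write
 * fault):
 *
 *	pmap_copy_page(src, dst);	... dst becomes a copy of src ...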
*/ void pmap_copy_page(vm_page_t src, vm_page_t dst) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); #ifdef I386_CPU invltlb(); #else invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); #endif *CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; bcopy(CADDR1, CADDR2, PAGE_SIZE); *CMAP1 = 0; *CMAP2 = 0; sched_unpin(); mtx_unlock(&CMAPCADDR12_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif - mtx_assert(&vm_page_queue_mtx, MA_OWNED); + vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va); free_pv_entry(pv); } sched_unpin(); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); + vm_page_unlock_queues(); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
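 *
 * Typical pageout-style use (sketch; "m" is a hypothetical managed
 * page; the page queues lock is held, as asserted below):
 *
 *	vm_page_lock_queues();
 *	if (pmap_is_modified(m))
 *		vm_page_dirty(m);
 *	vm_page_unlock_queues();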
*/ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; boolean_t rv; rv = FALSE; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rv); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); if (*pmap_pde(pmap, addr)) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the given bit in each of the given page's ptes. */ static __inline void pmap_clear_ptes(vm_page_t m, int bit) { register pv_entry_t pv; pt_entry_t pbits, *pte; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings setting/clearing as appropriate. If * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (bit == PG_RW) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif PMAP_LOCK(pv->pv_pmap); pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } PMAP_UNLOCK(pv->pv_pmap); } if (bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_clear_ptes(m, PG_RW); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages.
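 *
 * A sketch of how an aging scan might consume the count (hypothetical
 * caller, page queues lock held):
 *
 *	int refs = pmap_ts_referenced(m);
 *	if (refs == 0)
 *		... no mapping referenced the page; deactivate it ...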
*/ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; PMAP_LOCK(pv->pv_pmap); pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { PMAP_UNLOCK(pv->pv_pmap); break; } } PMAP_UNLOCK(pv->pv_pmap); } while ((pv = pvn) != NULL && pv != pvf); } sched_unpin(); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_clear_ptes(m, PG_M); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_clear_ptes(m, PG_A); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? 
*ptep : 0; PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap, oldpmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + PDRMASK) & ~PDRMASK; return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(pa) vm_paddr_t pa; { pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, 
pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/ia64/ia64/pmap.c =================================================================== --- head/sys/ia64/ia64/pmap.c (revision 132081) +++ head/sys/ia64/ia64/pmap.c (revision 132082) @@ -1,2388 +1,2387 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 1998,2000 Doug Rabson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Following the Linux model, region IDs are allocated in groups of * eight so that a single region ID can be used for as many RRs as we * want by encoding the RR number into the low bits of the ID. * * We reserve region ID 0 for the kernel and allocate the remaining * IDs for user pmaps. * * Region 0..4 * User virtually mapped * * Region 5 * Kernel virtually mapped * * Region 6 * Kernel physically mapped uncacheable * * Region 7 * Kernel physically mapped cacheable */ /* XXX move to a header. */ extern u_int64_t ia64_gateway_page[]; MALLOC_DEFINE(M_PMAP, "PMAP", "PMAP Structures"); #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 /* Preallocate at least this many */ #if 0 #define PMAP_DIAGNOSTIC #define PMAP_DEBUG #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pte_w(pte) ((pte)->pte_ig & PTE_IG_WIRED) #define pmap_pte_managed(pte) ((pte)->pte_ig & PTE_IG_MANAGED) #define pmap_pte_v(pte) ((pte)->pte_p) #define pmap_pte_pa(pte) (((pte)->pte_ppn) << 12) #define pmap_pte_prot(pte) (((pte)->pte_ar << 2) | (pte)->pte_pl) #define pmap_pte_set_w(pte, v) ((v)?((pte)->pte_ig |= PTE_IG_WIRED) \ :((pte)->pte_ig &= ~PTE_IG_WIRED)) #define pmap_pte_set_prot(pte, v) do { \ (pte)->pte_ar = v >> 2; \ (pte)->pte_pl = v & 3; \ } while (0) /* * Given a map and a machine independent protection code, * convert to an ia64 protection code. */ #define pte_prot(m, p) (protection_codes[m == kernel_pmap ? 0 : 1][p]) #define pte_prot_pl(m, p) (pte_prot(m, p) & 3) #define pte_prot_ar(m, p) (pte_prot(m, p) >> 2) int protection_codes[2][8]; /* * Return non-zero if this pmap is currently active */ #define pmap_isactive(pmap) (pmap->pm_active) /* * Statically allocated kernel pmap */ struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ vm_offset_t vhpt_base, vhpt_size; struct mtx pmap_vhptmutex; /* * We use an object to own the kernel's 'page tables'. For simplicity, * we use one page directory to index a set of pages containing * ia64_lptes. This gives us up to 2Gb of kernel virtual space. */ static int nkpt; struct ia64_lpte **ia64_kptdir; #define KPTE_DIR_INDEX(va) \ ((va >> (2*PAGE_SHIFT-5)) & ((1<<(PAGE_SHIFT-3))-1)) #define KPTE_PTE_INDEX(va) \ ((va >> PAGE_SHIFT) & ((1<<(PAGE_SHIFT-5))-1)) #define NKPTEPG (PAGE_SIZE / sizeof(struct ia64_lpte)) vm_offset_t kernel_vm_end; /* Values for ptc.e. XXX values for SKI. 
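 *
 * PAL_PTCE_INFO describes the purge as an accumulating two-level walk,
 * which pmap_invalidate_all_1() below implements directly:
 *
 *	addr = base;
 *	for (i = 0; i < count1; i++) {
 *		for (j = 0; j < count2; j++) {
 *			ptc.e(addr);
 *			addr += stride2;
 *		}
 *		addr += stride1;
 *	}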
*/ static u_int64_t pmap_ptc_e_base = 0x100000000; static u_int64_t pmap_ptc_e_count1 = 3; static u_int64_t pmap_ptc_e_count2 = 2; static u_int64_t pmap_ptc_e_stride1 = 0x2000; static u_int64_t pmap_ptc_e_stride2 = 0x100000000; /* * Data for the RID allocator */ static int pmap_ridcount; static int pmap_rididx; static int pmap_ridmapsz; static int pmap_ridmax; static u_int64_t *pmap_ridmap; struct mtx pmap_ridmutex; /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * Data for allocating PTEs for user processes. */ static uma_zone_t ptezone; /* * VHPT instrumentation. */ static int pmap_vhpt_inserts; static int pmap_vhpt_collisions; static int pmap_vhpt_resident; SYSCTL_DECL(_vm_stats); SYSCTL_NODE(_vm_stats, OID_AUTO, vhpt, CTLFLAG_RD, 0, ""); SYSCTL_INT(_vm_stats_vhpt, OID_AUTO, inserts, CTLFLAG_RD, &pmap_vhpt_inserts, 0, ""); SYSCTL_INT(_vm_stats_vhpt, OID_AUTO, collisions, CTLFLAG_RD, &pmap_vhpt_collisions, 0, ""); SYSCTL_INT(_vm_stats_vhpt, OID_AUTO, resident, CTLFLAG_RD, &pmap_vhpt_resident, 0, ""); static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void ia64_protection_init(void); static pmap_t pmap_install(pmap_t); static void pmap_invalidate_all(pmap_t pmap); vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = IA64_PHYS_TO_RR7(pa); bzero((caddr_t) va, size); return va; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap() { int i, j, count, ridbits; struct ia64_pal_result res; /* * Query the PAL Code to find the loop parameters for the * ptc.e instruction. */ res = ia64_call_pal_static(PAL_PTCE_INFO, 0, 0, 0); if (res.pal_status != 0) panic("Can't configure ptc.e parameters"); pmap_ptc_e_base = res.pal_result[0]; pmap_ptc_e_count1 = res.pal_result[1] >> 32; pmap_ptc_e_count2 = res.pal_result[1] & ((1L<<32) - 1); pmap_ptc_e_stride1 = res.pal_result[2] >> 32; pmap_ptc_e_stride2 = res.pal_result[2] & ((1L<<32) - 1); if (bootverbose) printf("ptc.e base=0x%lx, count1=%ld, count2=%ld, " "stride1=0x%lx, stride2=0x%lx\n", pmap_ptc_e_base, pmap_ptc_e_count1, pmap_ptc_e_count2, pmap_ptc_e_stride1, pmap_ptc_e_stride2); /* * Setup RIDs. RIDs 0..7 are reserved for the kernel. * * We currently need at least 19 bits in the RID because PID_MAX * can only be encoded in 17 bits and we need RIDs for 5 regions * per process. With PID_MAX equalling 99999 this means that we * need to be able to encode 499995 (=5*PID_MAX). * The Itanium processor only has 18 bits and the architected * minimum is exactly that. So, we cannot use a PID based scheme * in those cases. Enter pmap_ridmap... * We should avoid the map when running on a processor that has * implemented enough bits. This means that we should pass the * process/thread ID to pmap. This we currently don't do, so we * use the map anyway. 
However, we don't want to allocate a map * that is large enough to cover the range dictated by the number * of bits in the RID, because that may result in a RID map of * 2MB in size for a 24-bit RID. A 64KB map is enough. * The bottom line: we create a 32KB map when the processor only * implements 18 bits (or when we can't figure it out). Otherwise * we create a 64KB map. */ res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { if (bootverbose) printf("Can't read VM Summary - assuming 18 Region ID bits\n"); ridbits = 18; /* guaranteed minimum */ } else { ridbits = (res.pal_result[1] >> 8) & 0xff; if (bootverbose) printf("Processor supports %d Region ID bits\n", ridbits); } if (ridbits > 19) ridbits = 19; pmap_ridmax = (1 << ridbits); pmap_ridmapsz = pmap_ridmax / 64; pmap_ridmap = (u_int64_t *)pmap_steal_memory(pmap_ridmax / 8); pmap_ridmap[0] |= 0xff; pmap_rididx = 0; pmap_ridcount = 8; mtx_init(&pmap_ridmutex, "RID allocator lock", NULL, MTX_DEF); /* * Allocate some memory for initial kernel 'page tables'. */ ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE); for (i = 0; i < NKPT; i++) { ia64_kptdir[i] = (void*)pmap_steal_memory(PAGE_SIZE); } nkpt = NKPT; kernel_vm_end = NKPT * PAGE_SIZE * NKPTEPG + VM_MIN_KERNEL_ADDRESS - VM_GATEWAY_SIZE; for (i = 0; phys_avail[i+2]; i+= 2) ; count = i+2; /* * Figure out a useful size for the VHPT, based on the size of * physical memory and try to locate a region which is large * enough to contain the VHPT (which must be a power of two in * size and aligned to a natural boundary). */ vhpt_size = 15; while ((1< i; j -= 2) { phys_avail[j] = phys_avail[j-2]; phys_avail[j+1] = phys_avail[j-2+1]; } phys_avail[count+2] = 0; phys_avail[count+3] = 0; phys_avail[i+1] = vhpt_base; phys_avail[i+2] = vhpt_base + (1L << vhpt_size); } else { phys_avail[i] = vhpt_base + (1L << vhpt_size); } vhpt_base = IA64_PHYS_TO_RR7(vhpt_base); bzero((void *) vhpt_base, (1L << vhpt_size)); mtx_init(&pmap_vhptmutex, "VHPT collision chain lock", NULL, MTX_DEF); __asm __volatile("mov cr.pta=%0;; srlz.i;;" :: "r" (vhpt_base + (1<<8) + (vhpt_size<<2) + 1)); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ ia64_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ for (i = 0; i < 5; i++) kernel_pmap->pm_rid[i] = 0; kernel_pmap->pm_active = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); PCPU_SET(current_pmap, kernel_pmap); /* * Region 5 is mapped via the vhpt. */ ia64_set_rr(IA64_RR_BASE(5), (5 << 8) | (PAGE_SHIFT << 2) | 1); /* * Region 6 is direct mapped UC and region 7 is direct mapped * WC. The details of this are controlled by the Alt {I,D}TLB * handlers. Here we just make sure that they have the largest * possible page size to minimise TLB usage. */ ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (IA64_ID_PAGE_SHIFT << 2)); ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (IA64_ID_PAGE_SHIFT << 2)); /* * Clear out any random TLB entries left over from booting. */ pmap_invalidate_all(kernel_pmap); map_gateway_page(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistent * way, discontiguous physical memory. */ void pmap_init(void) { int i; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table.
*/ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Init the pv free list and the PTE free list. */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(pvzone, MINPV); ptezone = uma_zcreate("PT ENTRY", sizeof (struct ia64_lpte), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); uma_prealloc(ptezone, MINPV); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; pv_entry_high_water = 9 * (pv_entry_max / 10); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); ia64_ptc_g(va, PAGE_SHIFT << 2); } static void pmap_invalidate_all_1(void *arg) { u_int64_t addr; int i, j; register_t psr; psr = intr_disable(); addr = pmap_ptc_e_base; for (i = 0; i < pmap_ptc_e_count1; i++) { for (j = 0; j < pmap_ptc_e_count2; j++) { ia64_ptc_e(addr); addr += pmap_ptc_e_stride2; } addr += pmap_ptc_e_stride1; } intr_restore(psr); } static void pmap_invalidate_all(pmap_t pmap) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); #ifdef SMP smp_rendezvous(0, pmap_invalidate_all_1, 0, 0); #else pmap_invalidate_all_1(0); #endif } static u_int32_t pmap_allocate_rid(void) { uint64_t bit, bits; int rid; mtx_lock(&pmap_ridmutex); if (pmap_ridcount == pmap_ridmax) panic("pmap_allocate_rid: All Region IDs used"); /* Find an index with a free bit. */ while ((bits = pmap_ridmap[pmap_rididx]) == ~0UL) { pmap_rididx++; if (pmap_rididx == pmap_ridmapsz) pmap_rididx = 0; } rid = pmap_rididx * 64; /* Find a free bit. */ bit = 1UL; while (bits & bit) { rid++; bit <<= 1; } pmap_ridmap[pmap_rididx] |= bit; pmap_ridcount++; mtx_unlock(&pmap_ridmutex); return rid; } static void pmap_free_rid(u_int32_t rid) { uint64_t bit; int idx; idx = rid / 64; bit = ~(1UL << (rid & 63)); mtx_lock(&pmap_ridmutex); pmap_ridmap[idx] &= bit; pmap_ridcount--; mtx_unlock(&pmap_ridmutex); } /*************************************************** * Low level helper routines..... ***************************************************/ /* * Install a pte into the VHPT */ static PMAP_INLINE void pmap_install_pte(struct ia64_lpte *vhpte, struct ia64_lpte *pte) { u_int64_t *vhp, *p; vhp = (u_int64_t *)vhpte; p = (u_int64_t *)pte; critical_enter(); /* Invalidate the tag so the VHPT walker will not match this entry. */ vhp[2] = 1UL << 63; ia64_mf(); vhp[0] = p[0]; vhp[1] = p[1]; ia64_mf(); /* Install a proper tag now that we're done. */ vhp[2] = p[2]; ia64_mf(); critical_exit(); } /* * Compare essential parts of pte. */ static PMAP_INLINE int pmap_equal_pte(struct ia64_lpte *pte1, struct ia64_lpte *pte2) { return *(u_int64_t *) pte1 == *(u_int64_t *) pte2; } /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. 
*/ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } /*************************************************** * Page table page management routines..... ***************************************************/ void pmap_pinit0(struct pmap *pmap) { /* kernel_pmap is the same as any other pmap. */ pmap_pinit(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(struct pmap *pmap) { int i; pmap->pm_flags = 0; for (i = 0; i < 5; i++) pmap->pm_rid[i] = pmap_allocate_rid(); pmap->pm_ptphint = NULL; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { int i; for (i = 0; i < 5; i++) if (pmap->pm_rid[i]) pmap_free_rid(pmap->pm_rid[i]); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct ia64_lpte *ptepage; vm_page_t nkpg; if (kernel_vm_end >= addr) return; critical_enter(); while (kernel_vm_end < addr) { /* We could handle more by increasing the size of kptdir. */ if (nkpt == MAXKPT) panic("pmap_growkernel: out of kernel address space"); nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); ptepage = (struct ia64_lpte *) IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(nkpg)); bzero(ptepage, PAGE_SIZE); ia64_kptdir[KPTE_DIR_INDEX(kernel_vm_end)] = ptepage; nkpt++; kernel_vm_end += PAGE_SIZE * NKPTEPG; } critical_exit(); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } /* * Add an ia64_lpte to the VHPT. */ static void pmap_enter_vhpt(struct ia64_lpte *pte, vm_offset_t va) { struct ia64_lpte *vhpte; pmap_vhpt_inserts++; pmap_vhpt_resident++; vhpte = (struct ia64_lpte *) ia64_thash(va); if (vhpte->pte_chain) pmap_vhpt_collisions++; mtx_lock(&pmap_vhptmutex); pte->pte_chain = vhpte->pte_chain; ia64_mf(); vhpte->pte_chain = ia64_tpa((vm_offset_t)pte); ia64_mf(); if (!vhpte->pte_p && pte->pte_p) pmap_install_pte(vhpte, pte); mtx_unlock(&pmap_vhptmutex); } /* * Update VHPT after a pte has changed. 
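 *
 * As with insertion, an update must never let the hardware VHPT
 * walker match a half-written entry; pmap_install_pte() above orders
 * its stores with memory fences for exactly this reason:
 *
 *	vhp[2] = 1UL << 63;	-- invalidate the tag first
 *	ia64_mf();
 *	vhp[0] = p[0];		-- then copy the pte payload
 *	vhp[1] = p[1];
 *	ia64_mf();
 *	vhp[2] = p[2];		-- publish the real tag last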
*/ static void pmap_update_vhpt(struct ia64_lpte *pte, vm_offset_t va) { struct ia64_lpte *vhpte; vhpte = (struct ia64_lpte *)ia64_thash(va); mtx_lock(&pmap_vhptmutex); if ((!vhpte->pte_p || vhpte->pte_tag == pte->pte_tag) && pte->pte_p) pmap_install_pte(vhpte, pte); mtx_unlock(&pmap_vhptmutex); } /* * Remove the ia64_lpte matching va from the VHPT. Return zero if it * worked or an appropriate error code otherwise. */ static int pmap_remove_vhpt(vm_offset_t va) { struct ia64_lpte *pte; struct ia64_lpte *lpte; struct ia64_lpte *vhpte; u_int64_t tag; vhpte = (struct ia64_lpte *)ia64_thash(va); /* * If the VHPTE is invalid, there can't be a collision chain. */ if (!vhpte->pte_p) { KASSERT(!vhpte->pte_chain, ("bad vhpte")); return (ENOENT); } lpte = vhpte; tag = ia64_ttag(va); mtx_lock(&pmap_vhptmutex); pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(vhpte->pte_chain); KASSERT(pte != NULL, ("foo")); while (pte->pte_tag != tag) { lpte = pte; if (pte->pte_chain == 0) { mtx_unlock(&pmap_vhptmutex); return (ENOENT); } pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(pte->pte_chain); } /* Snip this pv_entry out of the collision chain. */ lpte->pte_chain = pte->pte_chain; ia64_mf(); /* * If the VHPTE matches as well, change it to map the first * element from the chain if there is one. */ if (vhpte->pte_tag == tag) { if (vhpte->pte_chain) { pte = (void*)IA64_PHYS_TO_RR7(vhpte->pte_chain); pmap_install_pte(vhpte, pte); } else vhpte->pte_p = 0; } mtx_unlock(&pmap_vhptmutex); pmap_vhpt_resident--; return (0); } /* * Find the ia64_lpte for the given va, if any. */ static struct ia64_lpte * pmap_find_vhpt(vm_offset_t va) { struct ia64_lpte *pte; u_int64_t tag; tag = ia64_ttag(va); pte = (struct ia64_lpte *)ia64_thash(va); if (pte->pte_chain == 0) return (NULL); pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(pte->pte_chain); while (pte->pte_tag != tag) { if (pte->pte_chain == 0) return (NULL); pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(pte->pte_chain); } return (pte); } /* * Remove an entry from the list of managed mappings. */ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va, pv_entry_t pv) { if (!pv) { if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } } if (pv) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); return 0; } else { return ENOENT; } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(); pv->pv_pmap = pmap; pv->pv_va = va; vm_page_lock_queues(); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; vm_page_unlock_queues(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
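 *
 * Note that 0 is returned both for a NULL pmap and for a missing
 * translation, so the usual caller-side sketch is simply:
 *
 *	pa = pmap_extract(pmap, va);
 *	if (pa == 0)
 *		... no valid mapping at va ...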
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; pmap_t oldpmap; if (!pmap) return 0; oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); pmap_install(oldpmap); if (!pte) return 0; return pmap_pte_pa(pte); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa; vm_page_t m; m = NULL; mtx_lock(&Giant); if ((pa = pmap_extract(pmap, va)) != 0) { m = PHYS_TO_VM_PAGE(pa); vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); } mtx_unlock(&Giant); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Find the kernel lpte for mapping the given virtual address, which * must be in the part of region 5 which we can cover with our kernel * 'page tables'. */ static struct ia64_lpte * pmap_find_kpte(vm_offset_t va) { KASSERT((va >> 61) == 5, ("kernel mapping 0x%lx not in region 5", va)); KASSERT(IA64_RR_MASK(va) < (nkpt * PAGE_SIZE * NKPTEPG), ("kernel mapping 0x%lx out of range", va)); return (&ia64_kptdir[KPTE_DIR_INDEX(va)][KPTE_PTE_INDEX(va)]); } /* * Find a pte suitable for mapping a user-space address. If one exists * in the VHPT, that one will be returned, otherwise a new pte is * allocated. */ static struct ia64_lpte * pmap_find_pte(vm_offset_t va) { struct ia64_lpte *pte; if (va >= VM_MAXUSER_ADDRESS) return pmap_find_kpte(va); pte = pmap_find_vhpt(va); if (!pte) { pte = uma_zalloc(ptezone, M_WAITOK); pte->pte_p = 0; } return pte; } /* * Free a pte which is now unused. This simply returns it to the zone * allocator if it is a user mapping. For kernel mappings, clear the * valid bit to make it clear that the mapping is not currently used. */ static void pmap_free_pte(struct ia64_lpte *pte, vm_offset_t va) { if (va < VM_MAXUSER_ADDRESS) uma_zfree(ptezone, pte); else pte->pte_p = 0; } /* * Set a pte to contain a valid mapping and enter it in the VHPT. If * the pte was originally valid, then it's assumed to already be in the * VHPT. */ static void pmap_set_pte(struct ia64_lpte *pte, vm_offset_t va, vm_offset_t pa, int ig, int pl, int ar) { int wasvalid = pte->pte_p; pte->pte_p = 1; pte->pte_ma = PTE_MA_WB; if (ig & PTE_IG_MANAGED) { pte->pte_a = 0; pte->pte_d = 0; } else { pte->pte_a = 1; pte->pte_d = 1; } pte->pte_pl = pl; pte->pte_ar = ar; pte->pte_ppn = pa >> 12; pte->pte_ed = 0; pte->pte_ig = ig; pte->pte_ps = PAGE_SHIFT; pte->pte_key = 0; pte->pte_tag = ia64_ttag(va); if (wasvalid) { pmap_update_vhpt(pte, va); } else { pmap_enter_vhpt(pte, va); } } /* * If a pte contains a valid mapping, clear it and update the VHPT. */ static void pmap_clear_pte(struct ia64_lpte *pte, vm_offset_t va) { if (pte->pte_p) { pmap_remove_vhpt(va); ia64_ptc_g(va, PAGE_SHIFT << 2); pte->pte_p = 0; } } /* * Remove the (possibly managed) mapping represented by pte from the * given pmap. */ static int pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va, pv_entry_t pv, int freepte) { int error; vm_page_t m; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing pte for non-current pmap")); /* * First remove from the VHPT. */ error = pmap_remove_vhpt(va); if (error) return error; /* * Make sure pmap_set_pte() knows it isn't in the VHPT.
*/ pte->pte_p = 0; if (pte->pte_ig & PTE_IG_WIRED) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte->pte_ig & PTE_IG_MANAGED) { m = PHYS_TO_VM_PAGE(pmap_pte_pa(pte)); if (pte->pte_d) if (pmap_track_modified(va)) vm_page_dirty(m); if (pte->pte_a) vm_page_flag_set(m, PG_REFERENCED); if (freepte) pmap_free_pte(pte, va); return pmap_remove_entry(pmap, m, va, pv); } else { if (freepte) pmap_free_pte(pte, va); return 0; } } /* * Extract the physical page address associated with a kernel * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct ia64_lpte *pte; vm_offset_t gwpage; KASSERT(va >= IA64_RR_BASE(5), ("Must be kernel VA")); /* Regions 6 and 7 are direct mapped. */ if (va >= IA64_RR_BASE(6)) return (IA64_RR_MASK(va)); /* EPC gateway page? */ gwpage = (vm_offset_t)ia64_get_k5(); if (va >= gwpage && va < gwpage + VM_GATEWAY_SIZE) return (IA64_RR_MASK((vm_offset_t)ia64_gateway_page)); /* Bail out if the virtual address is beyond our limits. */ if (IA64_RR_MASK(va) >= nkpt * PAGE_SIZE * NKPTEPG) return (0); pte = pmap_find_kpte(va); if (!pte->pte_p) return (0); return ((pte->pte_ppn << 12) | (va & PAGE_MASK)); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; struct ia64_lpte *pte; for (i = 0; i < count; i++) { vm_offset_t tva = va + i * PAGE_SIZE; int wasvalid; pte = pmap_find_kpte(tva); wasvalid = pte->pte_p; pmap_set_pte(pte, tva, VM_PAGE_TO_PHYS(m[i]), 0, PTE_PL_KERN, PTE_AR_RWX); if (wasvalid) ia64_ptc_g(tva, PAGE_SHIFT << 2); } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { int i; struct ia64_lpte *pte; for (i = 0; i < count; i++) { pte = pmap_find_kpte(va); pmap_clear_pte(pte, va); va += PAGE_SIZE; } } /* * Add a wired page to the kva. */ void pmap_kenter(vm_offset_t va, vm_offset_t pa) { struct ia64_lpte *pte; int wasvalid; pte = pmap_find_kpte(va); wasvalid = pte->pte_p; pmap_set_pte(pte, va, pa, 0, PTE_PL_KERN, PTE_AR_RWX); if (wasvalid) ia64_ptc_g(va, PAGE_SHIFT << 2); } /* * Remove a page from the kva */ void pmap_kremove(vm_offset_t va) { struct ia64_lpte *pte; pte = pmap_find_kpte(va); pmap_clear_pte(pte, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { return IA64_PHYS_TO_RR7(start); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing page for non-current pmap")); pte = pmap_find_vhpt(va); if (pte) { pmap_remove_pte(pmap, pte, va, 0, 1); pmap_invalidate_page(pmap, va); } return; } /* * Remove the given range of addresses from the specified map. 
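 *
 * Two strategies are used below, chosen by comparing the pmap's
 * resident page count against the size of the range:
 *
 *	if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT))
 *		... walk the pv list, removing entries inside [sva, eva) ...
 *	else
 *		... probe the VHPT once per page-sized step in the range ...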
* * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_t oldpmap; vm_offset_t va; pv_entry_t pv; struct ia64_lpte *pte; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; oldpmap = pmap_install(pmap); /* * Special handling of removing one page. A very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); pmap_install(oldpmap); return; } if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT)) { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { va = pv->pv_va; if (va >= sva && va < eva) { pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_invalidate_page(pmap, va); } } } else { for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_find_vhpt(va); if (pte) { pmap_remove_pte(pmap, pte, va, 0, 1); pmap_invalidate_page(pmap, va); } } } pmap_install(oldpmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pmap_t oldpmap; pv_entry_t pv; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { struct ia64_lpte *pte; pmap_t pmap = pv->pv_pmap; vm_offset_t va = pv->pv_va; oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (pmap_pte_pa(pte) != VM_PAGE_TO_PHYS(m)) panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m)); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_invalidate_page(pmap, va); pmap_install(oldpmap); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); return; } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pmap_t oldpmap; struct ia64_lpte *pte; int newprot; if (pmap == NULL) return; oldpmap = pmap_install(pmap); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); pmap_install(oldpmap); return; } if (prot & VM_PROT_WRITE) { pmap_install(oldpmap); return; } newprot = pte_prot(pmap, prot); if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); while (sva < eva) { /* * If page is invalid, skip this page */ pte = pmap_find_vhpt(sva); if (!pte) { sva += PAGE_SIZE; continue; } if (pmap_pte_prot(pte) != newprot) { if (pte->pte_ig & PTE_IG_MANAGED) { vm_offset_t pa = pmap_pte_pa(pte); vm_page_t m = PHYS_TO_VM_PAGE(pa); if (pte->pte_d) { if (pmap_track_modified(sva)) vm_page_dirty(m); pte->pte_d = 0; } if (pte->pte_a) { vm_page_flag_set(m, PG_REFERENCED); pte->pte_a = 0; } } pmap_pte_set_prot(pte, newprot); pmap_update_vhpt(pte, sva); pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } pmap_install(oldpmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed.
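 *
 * A minimal caller-side sketch (hypothetical values):
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * which creates (or revalidates) the unwired mapping va -> m.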
* * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { pmap_t oldpmap; vm_offset_t pa; vm_offset_t opa; struct ia64_lpte origpte; struct ia64_lpte *pte; int managed; if (pmap == NULL) return; oldpmap = pmap_install(pmap); va &= ~PAGE_MASK; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif /* * Find (or create) a pte for the given mapping. */ pte = pmap_find_pte(va); origpte = *pte; if (origpte.pte_p) opa = pmap_pte_pa(&origpte); else opa = 0; managed = 0; pa = VM_PAGE_TO_PHYS(m) & ~PAGE_MASK; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte.pte_p && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte.pte_ig & PTE_IG_WIRED) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte.pte_ig & PTE_IG_WIRED)) pmap->pm_stats.wired_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte.pte_ig & PTE_IG_MANAGED) { if (origpte.pte_d && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } } managed = origpte.pte_ig & PTE_IG_MANAGED; goto validate; } /* * Mapping has changed, invalidate old range and fall * through to handle validating new mapping. */ if (opa) { int error; vm_page_lock_queues(); error = pmap_remove_pte(pmap, pte, va, 0, 0); vm_page_unlock_queues(); if (error) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); managed |= PTE_IG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. This * adds the pte to the VHPT if necessary. */ pmap_set_pte(pte, va, pa, managed | (wired ? PTE_IG_WIRED : 0), pte_prot_pl(pmap, prot), pte_prot_ar(pmap, prot)); /* * if the mapping or permission bits are different, we need * to invalidate the page. */ if (!pmap_equal_pte(&origpte, pte)) pmap_invalidate_page(pmap, va); pmap_install(oldpmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { struct ia64_lpte *pte; pmap_t oldpmap; int managed; oldpmap = pmap_install(pmap); pte = pmap_find_pte(va); if (pte->pte_p) goto reinstall; managed = 0; /* * Enter on the PV list since it's part of our managed memory. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); managed |= PTE_IG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Initialise PTE with read-only protection and enter into VHPT.
*/ pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m), managed, PTE_PL_USER, PTE_AR_R); reinstall: pmap_install(oldpmap); return (NULL); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pmap_t oldpmap; struct ia64_lpte *pte; if (pmap == NULL) return; oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); pmap_install(oldpmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((char *)(caddr_t)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. This is for the vm_idlezero process. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(mdst)); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); /* * Not found, check current mappings returning immediately if found. 
*/ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pv_entry_t pv, npv; - int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif - s = splvm(); + vm_page_lock_queues(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { struct ia64_lpte *pte; npv = TAILQ_NEXT(pv, pv_plist); if (pv->pv_va >= eva || pv->pv_va < sva) { continue; } pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pte->pte_ig & PTE_IG_WIRED) continue; pmap_remove_pte(pmap, pte, pv->pv_va, pv, 1); } - splx(s); pmap_invalidate_all(pmap); + vm_page_unlock_queues(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { pv_entry_t pv; if ((prot & VM_PROT_WRITE) != 0) return; if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { if ((m->flags & PG_WRITEABLE) == 0) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { int newprot = pte_prot(pv->pv_pmap, prot); pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte; pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); pmap_pte_set_prot(pte, newprot); pmap_update_vhpt(pte, pv->pv_va); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); pmap_install(oldpmap); } vm_page_flag_clear(m, PG_WRITEABLE); } else { pmap_remove_all(m); } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv; int count = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte; pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pte->pte_a) { count++; pte->pte_a = 0; pmap_update_vhpt(pte, pv->pv_va); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); } return count; } #if 0 /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t pmap_is_referenced(vm_page_t m) { pv_entry_t pv; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte = pmap_find_vhpt(pv->pv_va); pmap_install(oldpmap); KASSERT(pte != NULL, ("pte")); if (pte->pte_a) return 1; } return 0; } #endif /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
*/ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte = pmap_find_vhpt(pv->pv_va); pmap_install(oldpmap); KASSERT(pte != NULL, ("pte")); if (pte->pte_d) return 1; } return 0; } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { struct ia64_lpte *pte; pte = pmap_find_vhpt(addr); if (pte && pte->pte_p) return (FALSE); return (TRUE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pte->pte_d) { pte->pte_d = 0; pmap_update_vhpt(pte, pv->pv_va); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap_t oldpmap = pmap_install(pv->pv_pmap); struct ia64_lpte *pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pte->pte_a) { pte->pte_a = 0; pmap_update_vhpt(pte, pv->pv_va); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); } } /* * Miscellaneous support routines follow */ static void ia64_protection_init() { int prot, *kp, *up; kp = protection_codes[0]; up = protection_codes[1]; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: *kp++ = (PTE_AR_R << 2) | PTE_PL_KERN; *up++ = (PTE_AR_R << 2) | PTE_PL_KERN; break; case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = (PTE_AR_X_RX << 2) | PTE_PL_KERN; *up++ = (PTE_AR_X_RX << 2) | PTE_PL_USER; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = (PTE_AR_RW << 2) | PTE_PL_KERN; *up++ = (PTE_AR_RW << 2) | PTE_PL_USER; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = (PTE_AR_RWX << 2) | PTE_PL_KERN; *up++ = (PTE_AR_RWX << 2) | PTE_PL_USER; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: *kp++ = (PTE_AR_R << 2) | PTE_PL_KERN; *up++ = (PTE_AR_R << 2) | PTE_PL_USER; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = (PTE_AR_RX << 2) | PTE_PL_KERN; *up++ = (PTE_AR_RX << 2) | PTE_PL_USER; break; case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = (PTE_AR_RW << 2) | PTE_PL_KERN; *up++ = (PTE_AR_RW << 2) | PTE_PL_USER; break; case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = (PTE_AR_RWX << 2) | PTE_PL_KERN; *up++ = (PTE_AR_RWX << 2) | PTE_PL_USER; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { return (void*) IA64_PHYS_TO_RR6(pa); } /* * 'Unmap' a range mapped by pmap_mapdev().
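 *
 * On ia64 pmap_mapdev() simply hands back an address in the region 6
 * (uncacheable) direct map, so there is nothing to tear down and a
 * device-mapping round trip is just:
 *
 *	va = pmap_mapdev(pa, size);	-- IA64_PHYS_TO_RR6(pa)
 *	... device access through va ...
 *	pmap_unmapdev(va, size);	-- no-op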
*/ void pmap_unmapdev(vm_offset_t va, vm_size_t size) { return; } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pmap_t oldpmap; struct ia64_lpte *pte; int val = 0; oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(addr); pmap_install(oldpmap); if (!pte) return 0; if (pmap_pte_v(pte)) { vm_page_t m; vm_offset_t pa; val = MINCORE_INCORE; if ((pte->pte_ig & PTE_IG_MANAGED) == 0) return val; pa = pmap_pte_pa(pte); m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte->pte_d) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone */ vm_page_lock_queues(); if (pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte->pte_a) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone */ vm_page_lock_queues(); if (pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_install(vmspace_pmap(td->td_proc->p_vmspace)); } pmap_t pmap_switch(pmap_t pm) { pmap_t prevpm; int i; mtx_assert(&sched_lock, MA_OWNED); prevpm = PCPU_GET(current_pmap); if (prevpm == pm) return (prevpm); if (prevpm != NULL) atomic_clear_32(&prevpm->pm_active, PCPU_GET(cpumask)); if (pm == NULL) { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (i << 8)|(PAGE_SHIFT << 2)|1); } } else { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (pm->pm_rid[i] << 8)|(PAGE_SHIFT << 2)|1); } atomic_set_32(&pm->pm_active, PCPU_GET(cpumask)); } PCPU_SET(current_pmap, pm); __asm __volatile("srlz.d"); return (prevpm); } static pmap_t pmap_install(pmap_t pm) { pmap_t prevpm; mtx_lock_spin(&sched_lock); prevpm = pmap_switch(pm); mtx_unlock_spin(&sched_lock); return (prevpm); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #include "opt_ddb.h" #ifdef DDB #include static const char* psnames[] = { "1B", "2B", "4B", "8B", "16B", "32B", "64B", "128B", "256B", "512B", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G", "2G" }; static void print_trs(int type) { struct ia64_pal_result res; int i, maxtr; struct { struct ia64_pte pte; struct ia64_itir itir; struct ia64_ifa ifa; struct ia64_rr rr; } buf; static const char* manames[] = { "WB", "bad", "bad", "bad", "UC", "UCE", "WC", "NaT", }; res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { db_printf("Can't get VM summary\n"); return; } if (type == 0) maxtr = (res.pal_result[0] >> 40) & 0xff; else maxtr = (res.pal_result[0] >> 32) & 0xff; db_printf("V RID Virtual Page Physical Page PgSz ED AR PL D A MA P KEY\n"); for (i = 0; i <= maxtr; i++) { bzero(&buf, sizeof(buf)); res = ia64_call_pal_stacked_physical (PAL_VM_TR_READ, i, type, ia64_tpa((u_int64_t) &buf)); if (!(res.pal_result[0] & 1)) buf.pte.pte_ar = 0; if (!(res.pal_result[0] & 2)) buf.pte.pte_pl = 0; if (!(res.pal_result[0] & 4)) buf.pte.pte_d = 0; if (!(res.pal_result[0] & 8)) buf.pte.pte_ma = 0; db_printf( "%d %06x %013lx %013lx %4s %d %d %d %d %d %-3s %d %06x\n", buf.ifa.ifa_ig & 1, buf.rr.rr_rid, buf.ifa.ifa_vpn, buf.pte.pte_ppn, psnames[buf.itir.itir_ps], buf.pte.pte_ed, buf.pte.pte_ar, buf.pte.pte_pl, buf.pte.pte_d, buf.pte.pte_a, manames[buf.pte.pte_ma], buf.pte.pte_p, buf.itir.itir_key); } } DB_COMMAND(itr, db_itr) { print_trs(0); } DB_COMMAND(dtr, db_dtr) { print_trs(1); } DB_COMMAND(rr, 
db_rr) { int i; u_int64_t t; struct ia64_rr rr; printf("RR RID PgSz VE\n"); for (i = 0; i < 8; i++) { __asm __volatile ("mov %0=rr[%1]" : "=r"(t) : "r"(IA64_RR_BASE(i))); *(u_int64_t *) &rr = t; printf("%d %06x %4s %d\n", i, rr.rr_rid, psnames[rr.rr_ps], rr.rr_ve); } } DB_COMMAND(thash, db_thash) { if (!have_addr) return; db_printf("%p\n", (void *) ia64_thash(addr)); } DB_COMMAND(ttag, db_ttag) { if (!have_addr) return; db_printf("0x%lx\n", ia64_ttag(addr)); } #endif Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 132081) +++ head/sys/kern/kern_exec.c (revision 132082) @@ -1,1218 +1,1216 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int kern_execve(struct thread *td, char *fname, char **argv, char **envv, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. 
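 *
 * (Illustrative note, not part of this change: a 32-bit consumer on a
 * 64-bit kernel would typically read this value with something like
 *
 *	unsigned int val;
 *	size_t len = sizeof(val);
 *	sysctlbyname("kern.usrstack", &val, &len, NULL, 0);
 *
 * which is why the handlers below check req->oldlen against
 * sizeof(unsigned int) before truncating the 64-bit value.)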
*/ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #if defined(__amd64__) || defined(__ia64__) if (req->oldlen == sizeof(unsigned int)) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #if defined(__amd64__) || defined(__ia64__) if (req->oldlen == sizeof(unsigned int)) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * MPSAFE */ int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif /* * MPSAFE */ int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC return (kern_execve(td, uap->fname, uap->argv, uap->envv, uap->mac_p)); #else return (ENOSYS); #endif } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. * * MPSAFE */ static int kern_execve(td, fname, argv, envv, mac_p) struct thread *td; char *fname; char **argv; char **envv; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int textset; #ifdef MAC struct label *interplabel = NULL; int will_transition; #endif imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. 
*/ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); if (p->p_flag & P_SA || p->p_numthreads > 1) { if (thread_single(SINGLE_EXIT)) { PROC_UNLOCK(p); mtx_unlock(&Giant); return (ERESTART); /* Try again later. */ } /* * If we get here all other threads are dead, * so unset the associated flags and lose KSE mode. */ p->p_flag &= ~P_SA; td->td_mailbox = NULL; td->td_pflags &= ~TDP_SA; thread_single_end(); } p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->userspace_argv = argv; imgp->userspace_envv = envv; imgp->execlabel = NULL; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) { mtx_lock(&Giant); goto exec_fail; } #endif /* * Allocate temporary demand-zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX); if (imgp->stringbase == NULL) { error = ENOMEM; mtx_lock(&Giant); goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, fname, td); mtx_lock(&Giant); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); goto exec_fail; } imgp->vp = ndp->ni_vp; imgp->fname = fname; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation: clean up and loop back to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked.
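 *
 * (For example: for a "#!/bin/sh" script the shell image activator
 * sets imgp->interpreted, records "/bin/sh" in imgp->interpreter_name,
 * and we loop back to the "interpret:" label above to activate the
 * interpreter binary itself.)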
*/ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC interplabel = mac_vnode_label_alloc(); mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel); #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK(p->p_fd); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->endargs - imgp->stringbase; if (ps_arg_cache_limit >= i + sizeof(struct pargs)) newargs = pargs_alloc(i); /* close files on exec */ fdcloseexec(td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_execve_will_transition(oldcred, imgp->vp, interplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. 
*/ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && suser_cred(oldcred, PRISON_ROOT)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_execve_transition(oldcred, newcred, imgp->vp, interplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. */ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Free any previous argument cache */ oldargs = p->p_args; p->p_args = NULL; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { bcopy(imgp->stringbase, newargs->ar_args, i); p->p_args = newargs; newargs = NULL; } PROC_UNLOCK(p); /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); /* * Handle deferred decrement of ref counts. 
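 *
 * (These drops were deferred because vrele() may sleep and so must
 * not be called with the proc lock held; the references were stashed
 * in locals while the lock was held and are only released here.)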
*/ if (textvp != NULL) vrele(textvp); if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (imgp->object != NULL) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); if (imgp->vmspace_destroyed) { /* sorry, there is no process anymore; exit gracefully */ #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ error = 0; } done2: #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif mtx_unlock(&Giant); return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; GIANT_REQUIRED; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); VOP_GETVOBJECT(imgp->vp, &object); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if (ma[i]->valid) break; vm_page_lock_queues(); if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) { vm_page_unlock_queues(); break; } vm_page_busy(ma[i]); vm_page_unlock_queues(); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_lock_queues(); pmap_remove_all(ma[0]); vm_page_free(ma[0]); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock_queues(); vm_page_hold(ma[0]); vm_page_wakeup(ma[0]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } } /* * Destroy the old address space and allocate a new stack. * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; GIANT_REQUIRED; imgp->vmspace_destroyed = 1; /* Called with Giant held, do not depend on it! */ EVENTHANDLER_INVOKE(process_exec, p); /* * Here is as good a place as any to do any resource limit cleanups.
* This is needed if a 64 bit binary exec's a 32 bit binary - the * data size limit may need to be changed to a value that makes * sense for the 32 bit binary. */ if (sv->sv_fixlimits != NULL) sv->sv_fixlimits(imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); - vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map), vm_map_max(map)); - vm_page_unlock_queues(); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ stack_addr = sv->sv_usrstack - maxssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; return (0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error; size_t length; /* * extract arguments first */ argv = imgp->userspace_argv; if (argv) { argp = (caddr_t)(intptr_t)fuword(argv); if (argp == (caddr_t)-1) return (EFAULT); if (argp) argv++; if (imgp->argv0) argp = imgp->argv0; if (argp) { do { if (argp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } while ((argp = (caddr_t)(intptr_t)fuword(argv++))); } } else return (EFAULT); imgp->endargs = imgp->stringp; /* * extract environment strings */ envv = imgp->userspace_envv; if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. 
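 *
 * An illustrative sketch of the resulting layout (top of the user
 * stack, from high to low addresses; exact offsets vary by ABI):
 *
 *	sv_psstrings ->	struct ps_strings
 *			signal trampoline (szsigcode bytes)
 *			SPARE_USRSPACE slop
 *	destp ->	argument and environment strings
 *			room for ELF auxargs (if imgp->auxargs)
 *			envp[] vector, NULL-terminated
 *	stack_base ->	argv[] vector, NULL-terminated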
*/ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is kept * for backwards compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets, and imgp->auxarg_size is room * for the arguments of the runtime loader. */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) * sizeof(char *)); /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->stringbase; argc = imgp->argc; envc = imgp->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Ensure that at least one execute bit is on - otherwise root * will always succeed, and we don't want that to happen unless the * file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero-length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case).
*/ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 132081) +++ head/sys/kern/kern_exit.c (revision 132082) @@ -1,765 +1,763 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include /* Required to be non-static for SysVR4 emulator */ MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); /* * exit -- * Death of process. * * MPSAFE */ void sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct bintime new_switchtime; struct proc *p, *nq, *q; struct tty *tp; struct vnode *ttyvp; struct vmspace *vm; struct vnode *vtmp; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif struct plimit *plim; /* * Drop Giant if caller has it. Eventually we should warn about * being called with Giant held. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); p = td->td_proc; if (p == initproc) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); if (p->p_flag & P_SA || p->p_numthreads > 1) { retry: /* * First check if some other thread got here before us; * if so, act appropriately (exit or suspend). */ thread_suspend_check(0); /* * Kill off the other threads. This requires * some cooperation from other parts of the kernel * so it may not be instant. * With this state set: * Any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK, * which will hopefully force them to back out to userland, * freeing resources as they go, and anything attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last other * thread exits. * If another single-threading request is already in * progress after resumption, calling thread_single() will * fail; in that case we just re-check all suspension * requests, and the thread should either suspend there * or exit. */ if (thread_single(SINGLE_EXIT)) goto retry; /* * All other activity in this process is now stopped. * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) * ... * Turn off threading support. */ p->p_flag &= ~P_SA; td->td_pflags &= ~TDP_SA; thread_single_end(); /* Don't need this any more. */ } p->p_flag |= P_WEXIT; PROC_UNLOCK(p); /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } PROC_LOCK(p); _STOPEVENT(p, S_EXIT, rv); wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ PROC_UNLOCK(p); /* * Check if any loadable modules need anything done at process exit. * e.g. SYSV IPC stuff * XXX what if one of these generates an error?
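 *
 * (Illustrative sketch, not part of this change: a module hooks this
 * event with something like
 *
 *	static void
 *	my_exit_hook(void *arg, struct proc *p)
 *	{
 *		... per-module cleanup for p ...
 *	}
 *	EVENTHANDLER_REGISTER(process_exit, my_exit_hook, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 *
 * where "my_exit_hook" is a hypothetical name. Handlers return void,
 * so there is currently no way for one to report an error, hence the
 * XXX above.)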
*/ EVENTHANDLER_INVOKE(process_exit, p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); SIGEMPTYSET(p->p_siglist); SIGEMPTYSET(td->td_siglist); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ mtx_lock(&Giant); /* XXX: not sure if needed */ funsetownlst(&p->p_sigiolst); /* * Close open files and release open-file table. * This may block! */ fdfree(td); mtx_unlock(&Giant); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); mtx_lock(&Giant); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. * * Processes sharing the same vmspace may exit in one order, and * get cleaned up by vmspace_exit() in a different order. The * last exiting process to reach this point releases as much of * the environment as it can, and the last process cleaned up * by vmspace_exit() (which decrements exitingcnt) cleans up the * remainder. */ ++vm->vm_exitingcnt; if (--vm->vm_refcnt == 0) { shmexit(vm); - vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); - vm_page_unlock_queues(); (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); } sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp; sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { tp = sp->s_ttyp; if (sp->s_ttyp->t_pgrp) { PGRP_LOCK(sp->s_ttyp->t_pgrp); pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); PGRP_UNLOCK(sp->s_ttyp->t_pgrp); } /* XXX tp should be locked. */ sx_xunlock(&proctree_lock); (void) ttywait(tp); sx_xlock(&proctree_lock); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); sx_xunlock(&proctree_lock); VOP_REVOKE(ttyvp, REVOKEALL); vrele(ttyvp); sx_xlock(&proctree_lock); } } if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); vrele(ttyvp); } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. 
* (for logging and informational purposes) */ } SESS_LOCK(p->p_session); sp->s_leader = NULL; SESS_UNLOCK(p->p_session); } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); mtx_unlock(&Giant); #ifdef KTRACE /* * release trace file */ PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { mtx_lock(&Giant); vrele(tracevp); mtx_unlock(&Giant); } if (tracecred != NULL) crfree(tracecred); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; mtx_lock(&Giant); vrele(vtmp); mtx_unlock(&Giant); } /* * Release our limits structure. */ PROC_LOCK(p); plim = p->p_limit; p->p_limit = NULL; PROC_UNLOCK(p); lim_free(plim); /* * Release this thread's reference to the ucred. The actual proc * reference will stay around until the proc is harvested by * wait(). At this point the ucred is immutable (no other threads * from this proc are around that can change it) so we leave the * per-thread ucred pointer intact in case it is needed although * in theory nothing should be using it at this point. */ crfree(td->td_ucred); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ mtx_lock(&Giant); PROC_LOCK(p); p->p_xstat = rv; p->p_xlwpid = td->td_tid; *p->p_ru = p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); mtx_unlock_spin(&sched_lock); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify interested parties of our demise. */ KNOTE(&p->p_klist, NOTE_EXIT); mtx_unlock(&Giant); /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ while (SLIST_FIRST(&p->p_klist)) SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. 
*/ if (LIST_EMPTY(&pp->p_children)) wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == initproc) psignal(p->p_pptr, SIGCHLD); else if (p->p_sigparent != 0) psignal(p->p_pptr, p->p_sigparent); PROC_UNLOCK(p->p_pptr); /* * If this is a kthread, then wake up anyone waiting for it to exit. */ if (p->p_flag & P_KTHREAD) wakeup(p); PROC_UNLOCK(p); /* * Finally, call machine-dependent code to release the remaining * resources including address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); PROC_LOCK(p); PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); while (mtx_owned(&Giant)) mtx_unlock(&Giant); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid any possible context * switches while marked as a zombie, including blocking on * a mutex. */ mtx_lock_spin(&sched_lock); p->p_state = PRS_ZOMBIE; critical_enter(); mtx_unlock_spin(&sched_lock); wakeup(p->p_pptr); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); critical_exit(); /* Do the same timestamp bookkeeping that mi_switch() would do. */ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); cnt.v_swtch++; sched_exit(p->p_pptr, p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifdef COMPAT_43 /* * MPSAFE. The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * MPSAFE. The dirty work is handled by kern_wait(). */ int wait4(struct thread *td, struct wait_args *uap) { struct rusage ru; int error, status; error = kern_wait(td, uap->pid, &status, uap->options, &ru); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { int nfound; struct proc *p, *q, *t; int error; q = td->td_proc; if (pid == 0) { PROC_LOCK(q); pid = -q->p_pgid; PROC_UNLOCK(q); } if (options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) return (EINVAL); loop: nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (pid != WAIT_ANY && p->p_pid != pid && p->p_pgid != -pid) { PROC_UNLOCK(p); continue; } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); continue; } nfound++; if (p->p_state == PRS_ZOMBIE) { td->td_retval[0] = p->p_pid; if (status) *status = p->p_xstat; /* convert to int */ if (rusage) bcopy(p->p_ru, rusage, sizeof(struct rusage)); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent.
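 *
 * (p is unlocked across the pfind() call below because pfind()
 * acquires the allproc sx lock, which may sleep and therefore
 * cannot be taken while the process mutex is held.)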
*/ PROC_UNLOCK(p); if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); p->p_oppid = 0; proc_reparent(p, t); PROC_UNLOCK(p); psignal(t, SIGCHLD); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return (0); } /* * Remove other references to this process to ensure * we have an exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that * all other writes to this proc are visible now, so * no more locking is needed for p. */ mtx_lock(&Giant); PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, p->p_ru); PROC_UNLOCK(q); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; mtx_unlock(&Giant); /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Free credentials, arguments, and sigacts */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * do any thread-system specific cleanups */ thread_wait(p); /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ mtx_lock(&Giant); vm_waitproc(p); mtx_unlock(&Giant); #ifdef MAC mac_destroy_proc(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("kern_wait: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); return (0); } mtx_lock_spin(&sched_lock); if (P_SHOULDSTOP(p) && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0) && (p->p_flag & P_TRACED || options & WUNTRACED)) { mtx_unlock_spin(&sched_lock); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status) *status = W_STOPCODE(p->p_xstat); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; p->p_flag &= ~P_CONTINUED; PROC_UNLOCK(p); if (status) *status = SIGCONT; return (0); } PROC_UNLOCK(p); } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; }
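The kern_wait() paths above (zombie, stopped, continued) are driven from userland through wait4(2). A minimal, self-contained sketch in standard C (illustrative only, not part of this change) that walks a child through all three reporting paths:

#include <sys/types.h>
#include <sys/wait.h>

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;
	int status;

	if ((pid = fork()) == 0) {
		raise(SIGSTOP);		/* Child stops itself. */
		sleep(1);		/* Let the parent see the continue. */
		exit(7);
	}

	/* kern_wait() reports the stop as W_STOPCODE(p_xstat). */
	if (waitpid(pid, &status, WUNTRACED) == pid && WIFSTOPPED(status))
		printf("child stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);

	/* kern_wait() stores SIGCONT as the status for P_CONTINUED. */
	if (waitpid(pid, &status, WCONTINUED) == pid && WIFCONTINUED(status))
		printf("child continued\n");

	/* Finally reap the zombie; the exit status comes from p_xstat. */
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("child exited with status %d\n", WEXITSTATUS(status));
	return (0);
}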