Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 6806) +++ head/sys/amd64/amd64/pmap.c (revision 6807) @@ -1,2121 +1,2124 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.49 1995/02/15 04:36:31 davidg Exp $ + * $Id: pmap.c,v 1.50 1995/02/26 05:14:16 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include /* * Allocate various and sundry SYSMAPs used in the days of old VM * and not yet converted. XXX. */ #define BSDVM_COMPAT 1 /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) int protection_codes[8]; struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t phys_avail[6]; /* 2 entries + 1 null */ vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_size_t mem_size; /* memory size in bytes */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int i386pagesperpage; /* PAGE_SIZE / I386_PAGE_SIZE */ boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/
vm_offset_t vm_first_phys, vm_last_phys;

/* Forward declarations of helpers used by this file. */
static inline int pmap_is_managed();
static inline void *vm_get_pmap();
static inline void vm_put_pmap();
static void i386_protection_init();
static void pmap_alloc_pv_entry();
static inline pv_entry_t get_pv_entry();
int nkpt;	/* set from NKPT at bootstrap; recounted by pmap_growkernel */

extern vm_offset_t clean_sva, clean_eva;
extern int cpu_class;

#if BSDVM_COMPAT
/*
 * NOTE(review): the header name after this #include was lost when the
 * file was extracted -- presumably the header declaring struct msgbuf;
 * confirm against the repository.
 */
#include

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1, *CMAP2, *ptmmap;
caddr_t CADDR1, CADDR2, ptvmmap;
pt_entry_t *msgbufmap;
struct msgbuf *msgbufp;

#endif

void init_pv_entries(int);

/*
 *	Routine:	pmap_pte
 *	Function:
 *		Extract the page table entry associated
 *		with the given map/virtual_address pair.
 * [ what about induced faults -wfj]
 *
 *	Returns 0 when pmap is NULL or the page directory entry covering
 *	va is zero.  Otherwise returns the PTE address obtained through
 *	vtopte() (kernel pmap, or pmap's directory frame equals PTDpde's)
 *	or through avtopte() after APTDpde has been pointed at this pmap.
 */
inline pt_entry_t * const
pmap_pte(pmap, va)
	register pmap_t pmap;
	vm_offset_t va;
{

	if (pmap && *pmap_pde(pmap, va)) {
		/* physical frame of this pmap's page directory */
		vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;

		/* are we current address space or kernel? */
		if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME)))
			return ((pt_entry_t *) vtopte(va));
		/* otherwise, we are alternate address space */
		else {
			/*
			 * Reload the alternate slot only when it names a
			 * different directory; pmap_update() follows the
			 * change.
			 */
			if (frame != ((int) APTDpde & PG_FRAME)) {
				APTDpde = pmap->pm_pdir[PTDPTDI];
				pmap_update();
			}
			return ((pt_entry_t *) avtopte(va));
		}
	}
	return (0);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 *
 *	Returns 0 when pmap is NULL or the page directory entry for va
 *	is zero.  The PTE itself is not checked for validity: its frame
 *	bits are combined with the non-frame bits of va.
 */
vm_offset_t
pmap_extract(pmap, va)
	register pmap_t pmap;
	vm_offset_t va;
{
	vm_offset_t pa;

	if (pmap && *pmap_pde(pmap, va)) {
		vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;

		/* are we current address space or kernel? */
		if ((pmap == kernel_pmap)
		    || (frame == ((int) PTDpde & PG_FRAME))) {
			pa = *(int *) vtopte(va);
			/* otherwise, we are alternate address space */
		} else {
			if (frame != ((int) APTDpde & PG_FRAME)) {
				APTDpde = pmap->pm_pdir[PTDPTDI];
				pmap_update();
			}
			pa = *(int *) avtopte(va);
		}
		return ((pa & PG_FRAME) | (va & ~PG_FRAME));
	}
	return 0;

}

/*
 * determine if a page is managed (memory vs. device)
 *
 * Returns 1 when pa lies in one of the [start, end) pairs of
 * phys_avail[], 0 otherwise.  Always returns 0 until pmap_initialized
 * is set.  The scan stops at the first pair whose end entry is zero.
 */
static inline int
pmap_is_managed(pa)
	vm_offset_t pa;
{
	int i;

	if (!pmap_initialized)
		return 0;

	for (i = 0; phys_avail[i + 1]; i += 2) {
		if (pa >= phys_avail[i] && pa < phys_avail[i + 1])
			return 1;
	}
	return 0;
}

/*
 * find the vm_page_t of a pte (only) given va of pte and pmap
 *
 * pt is the virtual address of a PTE.  It is reduced to an index
 * relative to UPT_MIN_ADDRESS, used to index pmap->pm_pdir, and the
 * frame found there is converted with PHYS_TO_VM_PAGE().
 */
__inline vm_page_t
pmap_pte_vm_page(pmap, pt)
	pmap_t pmap;
	vm_offset_t pt;
{
	vm_page_t m;

	pt = i386_trunc_page(pt);
	pt = (pt - UPT_MIN_ADDRESS) / NBPG;
	pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME;
	m = PHYS_TO_VM_PAGE(pt);
	return m;
}

/*
 * Wire a page table page
 *
 * Takes a hold (vm_page_hold) on the page-table page that maps va.
 * Does nothing for va >= UPT_MIN_ADDRESS or before pmap_init has
 * completed.
 */
__inline void
pmap_use_pt(pmap, va)
	pmap_t pmap;
	vm_offset_t va;
{
	vm_offset_t pt;

	if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized)
		return;

	pt = (vm_offset_t) vtopte(va);
	vm_page_hold(pmap_pte_vm_page(pmap, pt));
}

/*
 * Unwire a page table page
 *
 * Drops the hold taken by pmap_use_pt().  For a non-kernel pmap, once
 * the page-table page has neither holds nor wirings left (and va is
 * below KPT_MIN_ADDRESS), its mappings are removed and the page is
 * freed.
 */
inline void
pmap_unuse_pt(pmap, va)
	pmap_t pmap;
	vm_offset_t va;
{
	vm_offset_t pt;
	vm_page_t m;

	if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized)
		return;

	pt = (vm_offset_t) vtopte(va);
	m = pmap_pte_vm_page(pmap, pt);
	vm_page_unhold(m);
	if (pmap != kernel_pmap &&
	    (m->hold_count == 0) &&
	    (m->wire_count == 0) &&
	    (va < KPT_MIN_ADDRESS)) {
		pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
		vm_page_free(m);
	}
}

/* [ macro again?, should I force kstack into user map here? -wfj ] */
/* Thin wrapper: expands the PMAP_ACTIVATE macro for (pmap, pcbp). */
void
pmap_activate(pmap, pcbp)
	register pmap_t pmap;
	struct pcb *pcbp;
{
	PMAP_ACTIVATE(pmap, pcbp);
}

/*
 *	Bootstrap the system enough to run with virtual memory.
 *	Map the kernel's code and data, and allocate the system page table.
*
 *	On the I386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 *
 *	firstaddr is used as the first free physical address: DMAPAGES
 *	pages starting there are mapped as the bus-DMA bounce buffer, and
 *	avail_start is set just past them.  loadaddr is not referenced by
 *	this function.
 */

/* number of pages reserved for the bounce buffer mapped below */
#define DMAPAGES 8
void
pmap_bootstrap(firstaddr, loadaddr)
	vm_offset_t firstaddr;
	vm_offset_t loadaddr;
{
#if BSDVM_COMPAT
	vm_offset_t va;
	pt_entry_t *pte;

#endif

	avail_start = firstaddr + DMAPAGES * NBPG;

	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
	virtual_end = VM_MAX_KERNEL_ADDRESS;
	i386pagesperpage = PAGE_SIZE / NBPG;

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence.
	 */
	kernel_pmap = &kernel_pmap_store;

	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD);

	simple_lock_init(&kernel_pmap->pm_lock);
	kernel_pmap->pm_count = 1;
	nkpt = NKPT;

#if BSDVM_COMPAT
	/*
	 * Allocate all the submaps we need
	 */
	/*
	 * SYSMAP(c, p, v, n): assign the next n pages of kernel VA to v
	 * (cast to type c) and the matching PTE pointer to p, advancing
	 * both the va and pte cursors.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*NBPG); p = pte; pte += (n);

	va = virtual_avail;
	pte = pmap_pte(kernel_pmap, va);

	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1)
	virtual_avail = va;
#endif
	/*
	 * Reserve special hunk of memory for use by bus dma as a bounce
	 * buffer (contiguous virtual *and* physical memory).
	 */
	/*
	 * NOTE(review): va is declared only under BSDVM_COMPAT but is used
	 * here unconditionally (as are CMAP1/CMAP2 below), so this function
	 * compiles only while BSDVM_COMPAT is non-zero.
	 */
	{
		extern vm_offset_t isaphysmem;

		isaphysmem = va;

		virtual_avail = pmap_map(va, firstaddr,
		    firstaddr + DMAPAGES * NBPG, VM_PROT_ALL);
	}

	/* clear the two CMAP entries and PTD entry 0, then pmap_update() */
	*(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0;
	pmap_update();

}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
*	pmap_init has been enhanced to support in a fairly consistent
 *	way, discontiguous physical memory.
 *
 *	phys_start and phys_end are not referenced; the managed range is
 *	derived from phys_avail[] instead.
 */
void
pmap_init(phys_start, phys_end)
	vm_offset_t phys_start, phys_end;
{
	vm_offset_t addr;
	vm_size_t npg, s;
	int i;

	/*
	 * Now that kernel map has been allocated, we can mark as unavailable
	 * regions which we have mapped in locore.
	 */
	addr = atdevbase;
	(void) vm_map_find(kernel_map, NULL, (vm_offset_t) 0,
	    &addr, (0x100000 - 0xa0000), FALSE);

	addr = (vm_offset_t) KERNBASE + IdlePTD;
	vm_object_reference(kernel_object);
	(void) vm_map_find(kernel_map, kernel_object, addr,
	    &addr, (4 + NKPDE) * NBPG, FALSE);

	/*
	 * calculate the number of pv_entries needed
	 */
	vm_first_phys = phys_avail[0];
	/*
	 * Find the terminating pair; phys_avail[(i - 2) + 1] is then the
	 * end of the last populated range.  NOTE(review): this indexes
	 * phys_avail[-1] if the table has no populated pair.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	s = (vm_size_t) (sizeof(struct pv_entry) * npg);
	s = i386_round_page(s);
	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
	pv_table = (pv_entry_t) addr;

	/*
	 * init the pv free list
	 */
	init_pv_entries(npg);
	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	For now, VM is already on, we only need to map the
 *	specified memory.
 *
 *	Enters one unwired kernel mapping per PAGE_SIZE step for
 *	[start, end) beginning at virt, and returns the first virtual
 *	address past the mapped range.
 */
vm_offset_t
pmap_map(virt, start, end, prot)
	vm_offset_t virt;
	vm_offset_t start;
	vm_offset_t end;
	int prot;
{
	while (start < end) {
		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
		virt += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	return (virt);
}

/*
 *	Create and return a physical map.
 *
 *	If the size specified for the map
 *	is zero, the map is an actual physical
 *	map, and may be referenced by the
 *	hardware.
 *
 *	If the size specified is non-zero,
 *	the map will be used in software only, and
 *	is bounded by that size.
 *
 * [ just allocate a ptd and mark it uninitialize -- should we track
 *   with a table which process has which ptd?
-wfj ] */

/*
 * Returns NULL for a non-zero size; otherwise returns a newly
 * malloc'ed, zeroed pmap that has been set up by pmap_pinit().
 */
pmap_t
pmap_create(size)
	vm_size_t size;
{
	register pmap_t pmap;

	/*
	 * Software use map does not need a pmap
	 */
	if (size)
		return (NULL);

	pmap = (pmap_t) malloc(sizeof *pmap, M_VMPMAP, M_WAITOK);
	bzero(pmap, sizeof(*pmap));
	pmap_pinit(pmap);
	return (pmap);
}


struct pmaplist {
	struct pmaplist *next;
};

/*
 * Allocate ctob(1) bytes from kernel_map, zero them, and return them
 * for use as a page directory (see pmap_pinit).
 */
static inline void *
vm_get_pmap()
{
	struct pmaplist *rtval;

	rtval = (struct pmaplist *) kmem_alloc(kernel_map, ctob(1));
	bzero(rtval, ctob(1));
	return rtval;
}

/*
 * Return memory obtained from vm_get_pmap() to kernel_map.
 */
static inline void
vm_put_pmap(up)
	struct pmaplist *up;
{
	kmem_free(kernel_map, (vm_offset_t) up, ctob(1));
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap)
	register struct pmap *pmap;
{
	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	pmap->pm_pdir = (pd_entry_t *) vm_get_pmap();

	/* wire in kernel global address entries */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);

	/* install self-referential address mapping entry */
	*(int *) (pmap->pm_pdir + PTDPTDI) =
	    ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW;

	pmap->pm_count = 1;
	simple_lock_init(&pmap->pm_lock);
}

/*
 * grow the number of kernel page table entries, if needed
 */

vm_page_t nkpg;			/* page allocated for the next kernel PDE, if any */
vm_offset_t kernel_vm_end;	/* first kernel VA with no page directory entry yet */

/*
 * Extend the kernel's page directory so that it covers addr.
 *
 * addr is rounded up to a page-directory-entry boundary
 * (NBPG * NPTEPG bytes).  Each missing PDE below that gets a freshly
 * allocated, wired, zeroed page, and the new PDE is copied into the
 * pmap of every process on allproc and into kernel_pmap.  Runs at
 * splhigh() and panics if no page can be allocated.
 */
void
pmap_growkernel(vm_offset_t addr)
{
	struct proc *p;
	struct pmap *pmap;
	int s;

	s = splhigh();
	/* first call: find how far the boot-time page directory extends */
	if (kernel_vm_end == 0) {
		kernel_vm_end = KERNBASE;
		nkpt = 0;
		while (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1);
			++nkpt;
		}
	}
	addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1);
	while (kernel_vm_end < addr) {
		/* already populated: step over it */
		if (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1);
			continue;
		}
		++nkpt;
		if (!nkpg) {
			nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM);
			if (!nkpg)
				panic("pmap_growkernel: no memory to grow kernel");
			vm_page_wire(nkpg);
			vm_page_remove(nkpg);
			pmap_zero_page(VM_PAGE_TO_PHYS(nkpg));
		}
		pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW);
		nkpg = NULL;

		/* propagate the new entry to every process page directory */
		for (p = (struct proc *) allproc; p != NULL; p = p->p_next) {
			if (p->p_vmspace) {
				pmap = &p->p_vmspace->vm_pmap;
				*pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end);
			}
		}
		*pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end);
		kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1);
	}
	splx(s);
}

/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 *
 *	Drops one reference under pm_lock; when the count reaches zero
 *	the pmap is released and freed.  A NULL pmap is ignored.
 */
void
pmap_destroy(pmap)
	register pmap_t pmap;
{
	int count;

	if (pmap == NULL)
		return;

	simple_lock(&pmap->pm_lock);
	count = --pmap->pm_count;
	simple_unlock(&pmap->pm_lock);
	if (count == 0) {
		pmap_release(pmap);
		free((caddr_t) pmap, M_VMPMAP);
	}
}

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap)
	register struct pmap *pmap;
{
	vm_put_pmap((struct pmaplist *) pmap->pm_pdir);
}

/*
 *	Add a reference to the specified pmap.
 */
void
pmap_reference(pmap)
	pmap_t pmap;
{
	if (pmap != NULL) {
		simple_lock(&pmap->pm_lock);
		pmap->pm_count++;
		simple_unlock(&pmap->pm_lock);
	}
}

/* refill threshold for get_pv_entry(): half a page's worth of entries */
#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2)

/*
 * Data for the pv entry allocation mechanism
 */
int pv_freelistcnt;		/* count of entries on pv_freelist */
pv_entry_t pv_freelist;		/* singly linked free list (via pv_next) */
vm_offset_t pvva;		/* next unused VA in the reserved pv region */
int npvvapg;			/* pages of reserved pv region still unused */

/*
 * free the pv_entry back to the free list
 *
 * A NULL pv is ignored.
 */
inline static void
free_pv_entry(pv)
	pv_entry_t pv;
{
	if (!pv)
		return;
	++pv_freelistcnt;
	pv->pv_next = pv_freelist;
	pv_freelist = pv;
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 * the memory allocation is performed bypassing the malloc code
 * because of the possibility of allocations at interrupt time.
*/
static inline pv_entry_t
get_pv_entry()
{
	pv_entry_t tmp;

	/*
	 * get more pv_entry pages if needed
	 */
	if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) {
		pmap_alloc_pv_entry();
	}
	/*
	 * get a pv_entry off of the free list
	 */
	--pv_freelistcnt;
	tmp = pv_freelist;
	pv_freelist = tmp->pv_next;
	return tmp;
}

/*
 * this *strange* allocation routine *statistically* eliminates the
 * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure.
 * also -- this code is MUCH MUCH faster than the malloc equiv...
 *
 * Takes one page from the region reserved by init_pv_entries(), maps
 * it at pvva, and pushes every pv_entry it holds onto the free list.
 * Panics if the free list is still empty on exit.
 */
static void
pmap_alloc_pv_entry()
{
	/*
	 * do we have any pre-allocated map-pages left?
	 */
	if (npvvapg) {
		vm_page_t m;

		/*
		 * we do this to keep recursion away
		 */
		pv_freelistcnt += PV_FREELIST_MIN;
		/*
		 * allocate a physical page out of the vm system
		 */
		m = vm_page_alloc(kernel_object,
		    pvva - vm_map_min(kernel_map), VM_ALLOC_INTERRUPT);
		if (m) {
			int newentries;
			int i;
			pv_entry_t entry;

			newentries = (NBPG / sizeof(struct pv_entry));
			/*
			 * wire the page
			 */
			vm_page_wire(m);
			m->flags &= ~PG_BUSY;
			/*
			 * let the kernel see it
			 */
			pmap_kenter(pvva, VM_PAGE_TO_PHYS(m));

			entry = (pv_entry_t) pvva;
			/*
			 * update the allocation pointers
			 */
			pvva += NBPG;
			--npvvapg;

			/*
			 * free the entries into the free list
			 */
			for (i = 0; i < newentries; i++) {
				free_pv_entry(entry);
				entry++;
			}
		}
		/* undo the temporary inflation applied above */
		pv_freelistcnt -= PV_FREELIST_MIN;
	}
	if (!pv_freelist)
		panic("get_pv_entry: cannot get a pv_entry_t");
}



/*
 * init the pv_entry allocation system
 */
#define PVSPERPAGE 64
void
init_pv_entries(npg)
	int npg;
{
	/*
	 * allocate enough kvm space for PVSPERPAGE entries per page (lots)
	 * kvm space is fairly cheap, be generous!!!  (the system can panic if
	 * this is too small.)
	 */
	npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG;
	pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG);
	/*
	 * get the first batch of entries
	 */
	free_pv_entry(get_pv_entry());
}

/*
 * Return the base of the PTE array through which pmap's page tables
 * can be addressed: PTmap for the kernel pmap or a pmap whose directory
 * frame equals PTDpde's, otherwise APTmap after APTDpde has been
 * pointed at this pmap (followed by pmap_update() when it changed).
 */
static pt_entry_t *
get_pt_entry(pmap)
	pmap_t pmap;
{
	vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;

	/* are we current address space or kernel? */
	if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) {
		return PTmap;
	}
	/* otherwise, we are alternate address space */
	if (frame != ((int) APTDpde & PG_FRAME)) {
		APTDpde = pmap->pm_pdir[PTDPTDI];
		pmap_update();
	}
	return APTmap;
}

/*
 * If it is the first entry on the list, it is actually
 * in the header and we must copy the following entry up
 * to the header.  Otherwise we must search the list for
 * the entry.  In either case we free the now unused entry.
 *
 * pv is the list header for the physical page; the (pmap, va) pair
 * identifies the mapping to drop.  Runs at splhigh().  Nothing happens
 * if no matching entry is found.
 */
void
pmap_remove_entry(pmap, pv, va)
	struct pmap *pmap;
	pv_entry_t pv;
	vm_offset_t va;
{
	pv_entry_t npv;
	int s;

	s = splhigh();
	if (pmap == pv->pv_pmap && va == pv->pv_va) {
		npv = pv->pv_next;
		if (npv) {
			*pv = *npv;
			free_pv_entry(npv);
		} else {
			/* list is now empty: mark the header unused */
			pv->pv_pmap = NULL;
		}
	} else {
		/* walk the chain keeping pv one step behind npv */
		for (npv = pv->pv_next; npv; npv = npv->pv_next) {
			if (pmap == npv->pv_pmap && va == npv->pv_va) {
				break;
			}
			pv = npv;
		}
		if (npv) {
			pv->pv_next = npv->pv_next;
			free_pv_entry(npv);
		}
	}
	splx(s);
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 *
 *	For each non-zero PTE in [sva, eva): the wired/resident counters
 *	are decremented, the PTE is cleared, a set PG_M bit is reflected
 *	into the vm_page's dirty field (subject to the address filters
 *	below), the pv entry is removed for managed pages, and the hold
 *	on the page-table page is dropped.
 */
void
pmap_remove(pmap, sva, eva)
	struct pmap *pmap;
	register vm_offset_t sva;
	register vm_offset_t eva;
{
	register pt_entry_t *ptp, *ptq;
	vm_offset_t pa;
	register pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m;		/* not referenced in this function */
	pt_entry_t oldpte;

	if (pmap == NULL)
		return;

	ptp = get_pt_entry(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if ((sva + NBPG) == eva) {

		if (*pmap_pde(pmap, sva) == 0)
			return;

		ptq = ptp + i386_btop(sva);

		if (!*ptq)
			return;
		/*
		 * Update statistics
		 */
		if (pmap_pte_w(ptq))
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		pa = pmap_pte_pa(ptq);
		oldpte = *ptq;
		*ptq = 0;

		if (pmap_is_managed(pa)) {
			if ((int) oldpte & PG_M) {
				if ((sva < USRSTACK || sva >= KERNBASE) ||
				    (sva >= USRSTACK && sva < USRSTACK + (UPAGES * NBPG))) {
					if (sva < clean_sva || sva >= clean_eva) {
						PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
					}
				}
			}
			pv = pa_to_pvh(pa);
			pmap_remove_entry(pmap, pv, sva);
		}
		pmap_unuse_pt(pmap, sva);
		pmap_update();
		return;
	}
	/* from here on sva and eva are page indices, not byte addresses */
	sva = i386_btop(sva);
	eva = i386_btop(eva);

	while (sva < eva) {
		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */

		if (*pmap_pde(pmap, i386_ptob(sva)) == 0) {
			/* We can race ahead here, straight to next pde.. */
			sva = ((sva + NPTEPG) & ~(NPTEPG - 1));
			continue;
		}
		ptq = ptp + sva;

		/*
		 * search for page table entries, use string operations that
		 * are much faster than explicitly scanning when page tables
		 * are not fully populated.
		 */
		if (*ptq == 0) {
			vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1));
			vm_offset_t nscan = pdnxt - sva;
			int found = 0;

			if ((nscan + sva) > eva)
				nscan = eva - sva;

			/*
			 * Scan up to nscan 32-bit entries from ptq comparing
			 * each with zero; found becomes 1 if a non-zero
			 * entry stopped the scan.  ptq is left one entry
			 * past the last one examined.
			 */
			asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
			    : "=D"(ptq), "=a"(found)
			    : "c"(nscan), "0"(ptq)
			    : "cx");

			if (!found) {
				sva = pdnxt;
				continue;
			}
			/* step back onto the non-zero entry */
			ptq -= 1;

			sva = ptq - ptp;
		}
		/*
		 * Update statistics
		 */
		oldpte = *ptq;
		if (((int) oldpte) & PG_W)
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		/*
		 * Invalidate the PTEs. XXX: should cluster them up and
		 * invalidate as many as possible at once.
		 */
		*ptq = 0;

		va = i386_ptob(sva);

		/*
		 * Remove from the PV table (raise IPL since we may be called
		 * at interrupt time).
		 */
		pa = ((int) oldpte) & PG_FRAME;
		if (!pmap_is_managed(pa)) {
			pmap_unuse_pt(pmap, va);
			++sva;
			continue;
		}
		if ((int) oldpte & PG_M) {
			if ((va < USRSTACK || va >= KERNBASE) ||
			    (va >= USRSTACK && va < USRSTACK + (UPAGES * NBPG))) {
				if (va < clean_sva || va >= clean_eva) {
					PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
				}
			}
		}
		pv = pa_to_pvh(pa);
		pmap_remove_entry(pmap, pv, va);
		pmap_unuse_pt(pmap, va);
		++sva;
	}
	pmap_update();
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 *
 *		The pv list for pa is consumed from the header at
 *		splhigh(); pmap_update() is called once at the end if any
 *		non-zero PTE was cleared.
 */
void
pmap_remove_all(pa)
	vm_offset_t pa;
{
	register pv_entry_t pv, npv;
	register pt_entry_t *pte, *ptp;
	vm_offset_t va;
	struct pmap *pmap;
	vm_page_t m;		/* assigned below but not otherwise used */
	int s;
	int anyvalid = 0;

	/*
	 * Not one of ours
	 */
	/*
	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
	 * pages!
	 */
	if (!pmap_is_managed(pa))
		return;

	pa = i386_trunc_page(pa);
	pv = pa_to_pvh(pa);
	m = PHYS_TO_VM_PAGE(pa);

	s = splhigh();
	while (pv->pv_pmap != NULL) {
		pmap = pv->pv_pmap;
		ptp = get_pt_entry(pmap);
		va = pv->pv_va;
		pte = ptp + i386_btop(va);
		if (pmap_pte_w(pte))
			pmap->pm_stats.wired_count--;
		if (*pte) {
			pmap->pm_stats.resident_count--;
			anyvalid++;

			/*
			 * Update the vm_page_t clean and reference bits.
			 */
			if ((int) *pte & PG_M) {
				if ((va < USRSTACK || va >= KERNBASE) ||
				    (va >= USRSTACK && va < USRSTACK + (UPAGES * NBPG))) {
					if (va < clean_sva || va >= clean_eva) {
						PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
					}
				}
			}
			*pte = 0;
			pmap_unuse_pt(pmap, va);
		}
		/* pop the header entry: pull the next one up, or mark empty */
		npv = pv->pv_next;
		if (npv) {
			*pv = *npv;
			free_pv_entry(npv);
		} else {
			pv->pv_pmap = NULL;
		}
	}
	splx(s);
	if (anyvalid)
		pmap_update();
}


/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 *
 *	A request without VM_PROT_READ removes the range instead; a
 *	request that includes VM_PROT_WRITE returns without touching any
 *	PTE.  Otherwise every non-zero PTE in [sva, eva) has its
 *	protection bits replaced, and pmap_update() is called once if
 *	any PTE was changed.
 */
void
pmap_protect(pmap, sva, eva, prot)
	register pmap_t pmap;
	vm_offset_t sva, eva;
	vm_prot_t prot;
{
	register pt_entry_t *pte;
	register vm_offset_t va;
	int i386prot;
	register pt_entry_t *ptp;
	int evap = i386_btop(eva);	/* eva as a page index */
	int anyvalid = 0;;

	if (pmap == NULL)
		return;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	if (prot & VM_PROT_WRITE)
		return;

	ptp = get_pt_entry(pmap);

	va = sva;
	while (va < eva) {
		int found = 0;
		int svap;
		vm_offset_t nscan;

		/*
		 * Page table page is not allocated. Skip it, we don't want to
		 * force allocation of unnecessary PTE pages just to set the
		 * protection.
		 */
		if (!*pmap_pde(pmap, va)) {
			/* XXX: avoid address wrap around */
			/*
			 * Also the target of the goto below: advance va to
			 * the next page-directory boundary.
			 */
	nextpde:
			if (va >= i386_trunc_pdr((vm_offset_t) - 1))
				break;
			va = i386_round_pdr(va + PAGE_SIZE);
			continue;
		}
		pte = ptp + i386_btop(va);

		if (*pte == 0) {
			/*
			 * scan for a non-empty pte
			 */
			svap = pte - ptp;
			nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap;

			if (nscan + svap > evap)
				nscan = evap - svap;

			found = 0;
			/*
			 * Scan up to nscan 32-bit entries from pte comparing
			 * each with zero; found becomes 1 if a non-zero
			 * entry stopped the scan, and pte is left one entry
			 * past it.
			 */
			if (nscan)
				asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
				    : "=D"(pte), "=a"(found)
				    : "c"(nscan), "0"(pte)
				    : "cx");

			if (!found)
				goto nextpde;

			/* step back onto the non-zero entry */
			pte -= 1;
			svap = pte - ptp;

			va = i386_ptob(svap);
		}
		anyvalid++;

		i386prot = pte_prot(pmap, prot);
		/* below UPT_MAX_ADDRESS add PG_u; in the UPT range also PG_RW */
		if (va < UPT_MAX_ADDRESS) {
			i386prot |= PG_u;
			if (va >= UPT_MIN_ADDRESS)
				i386prot |= PG_RW;
		}
		pmap_pte_set_prot(pte, i386prot);
		va += PAGE_SIZE;
	}
	if (anyvalid)
		pmap_update();
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; va = i386_trunc_page(va); pa = i386_trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=0x%x, va=0x%x\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. 
*/ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { pmap_update(); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va + i * NBPG); if (*pte) anyvalid++; *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | PG_W)); } if (anyvalid) pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. 
 *
 * Zeroes the PTEs for count pages starting at va, then calls
 * pmap_update().
 */
void
pmap_qremove(va, count)
	vm_offset_t va;
	int count;
{
	int i;
	register pt_entry_t *pte;

	for (i = 0; i < count; i++) {
		pte = vtopte(va + i * NBPG);
		*pte = 0;
	}
	pmap_update();
}

/*
 * add a wired page to the kva
 * note that in order for the mapping to take effect -- you
 * should do a pmap_update after doing the pmap_kenter...
 *
 * pmap_update() is called here only when the overwritten PTE was
 * non-zero.
 */
void
pmap_kenter(va, pa)
	vm_offset_t va;
	register vm_offset_t pa;
{
	register pt_entry_t *pte;
	int wasvalid = 0;

	pte = vtopte(va);

	if (*pte)
		wasvalid++;

	*pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_W));

	if (wasvalid)
		pmap_update();
}

/*
 * remove a page from the kernel pagetables
 */
void
pmap_kremove(va)
	vm_offset_t va;
{
	register pt_entry_t *pte;

	pte = vtopte(va);

	*pte = (pt_entry_t) 0;
	pmap_update();
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * 5. Tlbflush is deferred to calling procedure.
 * 6. Page IS managed.
 * but is *MUCH* faster than pmap_enter...
 */

static inline void
pmap_enter_quick(pmap, va, pa)
	register pmap_t pmap;
	vm_offset_t va;
	register vm_offset_t pa;
{
	register pt_entry_t *pte;
	register pv_entry_t pv, npv;
	int s;

	/*
	 * Enter on the PV list if part of our managed memory Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */

	pte = vtopte(va);

	/* a fault on the page table might occur here */
	if (*pte) {
		pmap_remove(pmap, va, va + PAGE_SIZE);
	}
	pv = pa_to_pvh(pa);
	s = splhigh();
	/*
	 * No entries yet, use header as the first entry
	 */
	if (pv->pv_pmap == NULL) {
		pv->pv_pmap = pmap;
		pv->pv_va = va;
		pv->pv_next = NULL;
	}
	/*
	 * There is at least one other VA mapping this page. Place this entry
	 * after the header.
	 */
	else {
		npv = get_pv_entry();
		npv->pv_va = va;
		npv->pv_pmap = pmap;
		npv->pv_next = pv->pv_next;
		pv->pv_next = npv;
	}
	splx(s);

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;

	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	*pte = (pt_entry_t) ((int) (pa | PG_V | PG_u));

	pmap_use_pt(pmap, va);

	return;
}

/* size threshold (bytes) used by pmap_object_init_pt's early-out test */
#define MAX_INIT_PT (1024*2048)
/*
 * pmap_object_init_pt preloads the ptes for a given object
 * into the specified pmap.  This eliminates the blast of soft
 * faults on process startup and immediately after an mmap.
 *
 * Returns without doing anything when pmap is NULL, when size exceeds
 * MAX_INIT_PT and the object has more than MAX_INIT_PT / NBPG resident
 * pages, or when the object lock cannot be taken with
 * vm_object_lock_try().
 *
 * NOTE(review): the two lines beginning with '+' are unified-diff
 * addition markers carried by this file, not C unary operators.
 */
void
pmap_object_init_pt(pmap, addr, object, offset, size)
	pmap_t pmap;
	vm_offset_t addr;
	vm_object_t object;
	vm_offset_t offset;
	vm_offset_t size;
{
	vm_offset_t tmpoff;
	vm_page_t p;
	int bits;		/* not referenced in this function */
	int objbytes;

	if (!pmap || ((size > MAX_INIT_PT) &&
		(object->resident_page_count > (MAX_INIT_PT / NBPG)))) {
		return;
	}
	if (!vm_object_lock_try(object))
		return;

	/*
	 * if we are processing a major portion of the object, then scan the
	 * entire thing.
	 */
	if (size > (object->size >> 2)) {
		objbytes = size;

		for (p = object->memq.tqh_first;
		    ((objbytes > 0) && (p != NULL));
		    p = p->listq.tqe_next) {

			tmpoff = p->offset;
			/* skip pages outside [offset, offset + size) */
			if (tmpoff < offset) {
				continue;
			}
			tmpoff -= offset;
			if (tmpoff >= size) {
				continue;
			}
			if (((p->flags & (PG_ACTIVE | PG_INACTIVE)) != 0) &&
			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->bmapped == 0) &&
				(p->busy == 0) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) == 0) {
				vm_page_hold(p);
+				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap, addr + tmpoff, VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
			objbytes -= NBPG;
		}
	} else {
		/*
		 * else lookup the pages one-by-one.
		 */
		for (tmpoff = 0; tmpoff < size; tmpoff += NBPG) {
			p = vm_page_lookup(object, tmpoff + offset);
			if (p && ((p->flags & (PG_ACTIVE | PG_INACTIVE)) != 0) &&
			    (p->bmapped == 0) &&
				(p->busy == 0) &&
			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) == 0) {
				vm_page_hold(p);
+				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap, addr + tmpoff, VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
		}
	}
	vm_object_unlock(object);
}

#if 0
/*
 * pmap_prefault provides a quick way of clustering
 * pagefaults into a processes address space.
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -NBPG, NBPG, -2 * NBPG, 2 * NBPG }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta, enda; vm_offset_t offset, addr; vm_page_t m; int pageorder_index; if (entry->object.vm_object != object) return; if (pmap != &curproc->p_vmspace->vm_pmap) return; starta = addra - PFBAK * NBPG; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) starta = 0; enda = addra + PFFOR * NBPG; if (enda > entry->end) enda = entry->end; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= enda) continue; pte = vtopte(addr); if (*pte) continue; offset = (addr - entry->start) + entry->offset; lobject = object; for (m = vm_page_lookup(lobject, offset); (!m && lobject->shadow && !lobject->pager); lobject = lobject->shadow) { offset += lobject->shadow_offset; m = vm_page_lookup(lobject->shadow, offset); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->flags & (PG_CACHE | PG_ACTIVE | PG_INACTIVE)) != 0) && ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->bmapped == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { /* * test results show that the system is faster when * pages are activated. */ if ((m->flags & PG_ACTIVE) == 0) { if( m->flags & PG_CACHE) vm_page_deactivate(m); else vm_page_activate(m); } vm_page_hold(m); + m->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); vm_page_unhold(m); } } } #endif /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. 
*/ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); /* * When unwiring, set the modified bit in the pte -- could have been * changed by the kernel */ if (!wired) (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); bzero(CADDR2, NBPG); *(int *) CMAP2 = 0; pmap_update(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); #if __GNUC__ > 1 memcpy(CADDR2, CADDR1, NBPG); #else bcopy(CADDR1, CADDR2, NBPG); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; pmap_update(); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
*/ if (bit & PG_U) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } if (bit & PG_M) { if (pv->pv_va >= USRSTACK) { if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { continue; } if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { splx(s); return TRUE; } else if (pv->pv_va < KERNBASE) { splx(s); return FALSE; } } } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); if (setem) (int) npte = (int) *pte | bit; else (int) npte = (int) *pte & ~bit; *pte = npte; } } splx(s); pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. 
*/ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Routine: pmap_copy_on_write * Function: * Remove write privileges from all * physical maps for this physical page. */ void pmap_copy_on_write(vm_offset_t pa) { pmap_changebit((pa), PG_RW, FALSE); } /* * Miscellaneous support routines follow */ void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. The non-cacheable bits are set on each * mapped page. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } #ifdef DEBUG /* print address space of pmap*/ void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 6806) +++ head/sys/i386/i386/pmap.c (revision 6807) @@ -1,2121 +1,2124 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.49 1995/02/15 04:36:31 davidg Exp $ + * $Id: pmap.c,v 1.50 1995/02/26 05:14:16 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include #include #include #include #include #include #include #include #include #include /* * Allocate various and sundry SYSMAPs used in the days of old VM * and not yet converted. XXX. */ #define BSDVM_COMPAT 1 /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) int protection_codes[8]; struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t phys_avail[6]; /* 2 entries + 1 null */ vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_size_t mem_size; /* memory size in bytes */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int i386pagesperpage; /* PAGE_SIZE / I386_PAGE_SIZE */ boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ vm_offset_t vm_first_phys, vm_last_phys; static inline int pmap_is_managed(); static inline void *vm_get_pmap(); static inline void vm_put_pmap(); static void i386_protection_init(); static void pmap_alloc_pv_entry(); static inline pv_entry_t get_pv_entry(); int nkpt; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; #if BSDVM_COMPAT #include /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1, *CMAP2, *ptmmap; caddr_t CADDR1, CADDR2, ptvmmap; pt_entry_t *msgbufmap; struct msgbuf *msgbufp; #endif void init_pv_entries(int); /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ inline pt_entry_t * const pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return ((pt_entry_t *) avtopte(va)); } } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t pa; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? 
*/ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } pa = *(int *) avtopte(va); } return ((pa & PG_FRAME) | (va & ~PG_FRAME)); } return 0; } /* * determine if a page is managed (memory vs. device) */ static inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ __inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { vm_page_t m; pt = i386_trunc_page(pt); pt = (pt - UPT_MIN_ADDRESS) / NBPG; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; m = PHYS_TO_VM_PAGE(pt); return m; } /* * Wire a page table page */ __inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold(pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; vm_page_t m; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); m = pmap_pte_vm_page(pmap, pt); vm_page_unhold(m); if (pmap != kernel_pmap && (m->hold_count == 0) && (m->wire_count == 0) && (va < KPT_MIN_ADDRESS)) { pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); } } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * Map the kernel's code and data, and allocate the system page table. 
*
 *	On the I386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */

/*
 * Number of pages, starting at firstaddr, that pmap_bootstrap sets aside
 * and maps for the bus dma bounce buffer.
 */
#define DMAPAGES 8

/*
 * firstaddr: physical address where the DMAPAGES reserved pages begin;
 *            avail_start is placed immediately after them.
 * loadaddr:  not referenced by this routine.
 */
void
pmap_bootstrap(firstaddr, loadaddr)
	vm_offset_t firstaddr;
	vm_offset_t loadaddr;
{
#if BSDVM_COMPAT
	vm_offset_t va;
	pt_entry_t *pte;

#endif
	/* First free physical page comes after the reserved DMA pages. */
	avail_start = firstaddr + DMAPAGES * NBPG;

	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
	virtual_end = VM_MAX_KERNEL_ADDRESS;
	i386pagesperpage = PAGE_SIZE / NBPG;

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence.
	 */
	kernel_pmap = &kernel_pmap_store;

	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD);

	simple_lock_init(&kernel_pmap->pm_lock);
	kernel_pmap->pm_count = 1;
	nkpt = NKPT;

#if BSDVM_COMPAT
	/*
	 * Allocate all the submaps we need
	 *
	 * SYSMAP(c, p, v, n) hands out n pages of kernel va: v receives the
	 * current va (cast to type c), p receives the pte pointer for it,
	 * and both cursors are advanced by n pages / n ptes.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*NBPG); p = pte; pte += (n);

	va = virtual_avail;
	pte = pmap_pte(kernel_pmap, va);

	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP2, CADDR2, 1)
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1)
	virtual_avail = va;
#endif
	/*
	 * Reserve special hunk of memory for use by bus dma as a bounce
	 * buffer (contiguous virtual *and* physical memory).
	 *
	 * NOTE(review): `va' is declared only under BSDVM_COMPAT but is
	 * used here unconditionally; this builds because BSDVM_COMPAT is
	 * defined to 1 in this file.
	 */
	{
		extern vm_offset_t isaphysmem;

		isaphysmem = va;

		virtual_avail = pmap_map(va, firstaddr,
		    firstaddr + DMAPAGES * NBPG, VM_PROT_ALL);
	}

	/* Zero the CMAP1 and CMAP2 ptes and the first PTD entry. */
	*(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0;
	pmap_update();

}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
* pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t npg, s; int i; /* * Now that kernel map has been allocated, we can mark as unavailable * regions which we have mapped in locore. */ addr = atdevbase; (void) vm_map_find(kernel_map, NULL, (vm_offset_t) 0, &addr, (0x100000 - 0xa0000), FALSE); addr = (vm_offset_t) KERNBASE + IdlePTD; vm_object_reference(kernel_object); (void) vm_map_find(kernel_map, kernel_object, addr, &addr, (4 + NKPDE) * NBPG, FALSE); /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(struct pv_entry) * npg); s = i386_round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_entry_t) addr; /* * init the pv free list */ init_pv_entries(npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } /* * Create and return a physical map. * * If the size specified for the map * is zero, the map is an actual physical * map, and may be referenced by the * hardware. * * If the size specified is non-zero, * the map will be used in software only, and * is bounded by that size. * * [ just allocate a ptd and mark it uninitialize -- should we track * with a table which process has which ptd? 
-wfj ] */ pmap_t pmap_create(size) vm_size_t size; { register pmap_t pmap; /* * Software use map does not need a pmap */ if (size) return (NULL); pmap = (pmap_t) malloc(sizeof *pmap, M_VMPMAP, M_WAITOK); bzero(pmap, sizeof(*pmap)); pmap_pinit(pmap); return (pmap); } struct pmaplist { struct pmaplist *next; }; static inline void * vm_get_pmap() { struct pmaplist *rtval; rtval = (struct pmaplist *) kmem_alloc(kernel_map, ctob(1)); bzero(rtval, ctob(1)); return rtval; } static inline void vm_put_pmap(up) struct pmaplist *up; { kmem_free(kernel_map, (vm_offset_t) up, ctob(1)); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { /* * No need to allocate page table space yet but we do need a valid * page directory table. */ pmap->pm_pdir = (pd_entry_t *) vm_get_pmap(); /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(int *) (pmap->pm_pdir + PTDPTDI) = ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; simple_lock_init(&pmap->pm_lock); } /* * grow the number of kernel page table entries, if needed */ vm_page_t nkpg; vm_offset_t kernel_vm_end; void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); ++nkpt; } } addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); 
pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW); nkpg = NULL; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; simple_lock(&pmap->pm_lock); count = --pmap->pm_count; simple_unlock(&pmap->pm_lock); if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_put_pmap((struct pmaplist *) pmap->pm_pdir); } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { simple_lock(&pmap->pm_lock); pmap->pm_count++; simple_unlock(&pmap->pm_lock); } } #define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ int pv_freelistcnt; pv_entry_t pv_freelist; vm_offset_t pvva; int npvvapg; /* * free the pv_entry back to the free list */ inline static void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
*/ static inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? */ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, pvva - vm_map_min(kernel_map), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; newentries = (NBPG / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ pvva += NBPG; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) 
*/ npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG; pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { return PTmap; } /* otherwise, we are alternate address space */ if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return APTmap; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { for (npv = pv->pv_next; npv; npv = npv->pv_next) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp, *ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; vm_page_t m; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. 
*/ if ((sva + NBPG) == eva) { if (*pmap_pde(pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if (!*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & PG_M) { if ((sva < USRSTACK || sva >= KERNBASE) || (sva >= USRSTACK && sva < USRSTACK + (UPAGES * NBPG))) { if (sva < clean_sva || sva >= clean_eva) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); } pmap_unuse_pt(pmap, sva); pmap_update(); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (*pmap_pde(pmap, i386_ptob(sva)) == 0) { /* We can race ahead here, straight to next pde.. */ sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations that * are much faster than explicitly scanning when page tables * are not fully populated. */ if (*ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(ptq), "=a"(found) : "c"(nscan), "0"(ptq) : "cx"); if (!found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int) oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. XXX: should cluster them up and * invalidate as many as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we may be called * at interrupt time). 
*/ pa = ((int) oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { pmap_unuse_pt(pmap, va); ++sva; continue; } if ((int) oldpte & PG_M) { if ((va < USRSTACK || va >= KERNBASE) || (va >= USRSTACK && va < USRSTACK + (UPAGES * NBPG))) { if (va < clean_sva || va >= clean_eva) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } pmap_update(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (!pmap_is_managed(pa)) return; pa = i386_trunc_page(pa); pv = pa_to_pvh(pa); m = PHYS_TO_VM_PAGE(pa); s = splhigh(); while (pv->pv_pmap != NULL) { pmap = pv->pv_pmap; ptp = get_pt_entry(pmap); va = pv->pv_va; pte = ptp + i386_btop(va); if (pmap_pte_w(pte)) pmap->pm_stats.wired_count--; if (*pte) { pmap->pm_stats.resident_count--; anyvalid++; /* * Update the vm_page_t clean and reference bits. */ if ((int) *pte & PG_M) { if ((va < USRSTACK || va >= KERNBASE) || (va >= USRSTACK && va < USRSTACK + (UPAGES * NBPG))) { if (va < clean_sva || va >= clean_eva) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } } *pte = 0; pmap_unuse_pt(pmap, va); } npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } splx(s); if (anyvalid) pmap_update(); } /* * Set the physical protection on the * specified range of this map as requested. 
*/
/*
 * pmap_protect(pmap, sva, eva, prot)
 *
 *	pmap		map to operate on; a NULL pmap is ignored.
 *	sva, eva	the loop below visits virtual addresses va with
 *			sva <= va < eva.
 *	prot		requested protection.
 *
 * A request without read permission is turned into pmap_remove() of the
 * range.  A request that includes write permission returns without
 * touching any pte.  Otherwise every non-zero pte found in the range has
 * its protection rewritten, and pmap_update() is called once at the end
 * if at least one pte was rewritten.
 */
void
pmap_protect(pmap, sva, eva, prot)
	register pmap_t pmap;
	vm_offset_t sva, eva;
	vm_prot_t prot;
{
	register pt_entry_t *pte;
	register vm_offset_t va;
	int i386prot;
	register pt_entry_t *ptp;
	int evap = i386_btop(eva);	/* eva in the same units as pte indices */
	int anyvalid = 0;;		/* number of ptes rewritten so far */

	if (pmap == NULL)
		return;

	/* No read access left: the mappings are removed rather than protected. */
	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	/* Write access requested: this routine changes nothing in that case. */
	if (prot & VM_PROT_WRITE)
		return;

	/* Base of the pte array through which this pmap is addressed. */
	ptp = get_pt_entry(pmap);

	va = sva;
	while (va < eva) {
		int found = 0;
		int svap;
		vm_offset_t nscan;

		/*
		 * Page table page is not allocated. Skip it, we don't want to
		 * force allocation of unnecessary PTE pages just to set the
		 * protection.
		 */
		if (!*pmap_pde(pmap, va)) {
			/* XXX: avoid address wrap around */
			/*
			 * nextpde is also the target of the goto in the scan
			 * below, used when the scan finds no non-zero pte.
			 */
	nextpde:
			if (va >= i386_trunc_pdr((vm_offset_t) - 1))
				break;
			va = i386_round_pdr(va + PAGE_SIZE);
			continue;
		}
		pte = ptp + i386_btop(va);

		if (*pte == 0) {
			/*
			 * scan for a non-empty pte
			 */
			svap = pte - ptp;
			/*
			 * Number of ptes from svap up to the next multiple of
			 * NPTEPG, clipped below so the scan stops at evap.
			 */
			nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap;

			if (nscan + svap > evap)
				nscan = evap - svap;

			found = 0;
			/*
			 * repe scasl compares the zeroed %eax against up to
			 * nscan longwords starting at pte.  found is set to 1
			 * when the scan stopped on a non-zero word; pte is
			 * left one entry past the last word examined.
			 */
			if (nscan)
				asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
				    : "=D"(pte), "=a"(found)
				    : "c"(nscan), "0"(pte)
				    : "cx");

			if (!found)
				goto nextpde;

			/* Step back onto the non-zero pte and recompute va. */
			pte -= 1;
			svap = pte - ptp;
			va = i386_ptob(svap);
		}
		anyvalid++;

		i386prot = pte_prot(pmap, prot);
		/*
		 * Addresses below UPT_MAX_ADDRESS additionally get PG_u;
		 * those in [UPT_MIN_ADDRESS, UPT_MAX_ADDRESS) also get PG_RW.
		 */
		if (va < UPT_MAX_ADDRESS) {
			i386prot |= PG_u;
			if (va >= UPT_MIN_ADDRESS)
				i386prot |= PG_RW;
		}
		pmap_pte_set_prot(pte, i386prot);
		va += PAGE_SIZE;
	}
	if (anyvalid)
		pmap_update();
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; va = i386_trunc_page(va); pa = i386_trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=0x%x, va=0x%x\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. 
*/ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { pmap_update(); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va + i * NBPG); if (*pte) anyvalid++; *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V | PG_W)); } if (anyvalid) pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. 
*/ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { pte = vtopte(va + i * NBPG); *pte = 0; } pmap_update(); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; int wasvalid = 0; pte = vtopte(va); if (*pte) wasvalid++; *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_W)); if (wasvalid) pmap_update(); } /* * remove a page from the kernel pagetables */ void pmap_kremove(va) vm_offset_t va; { register pt_entry_t *pte; pte = vtopte(va); *pte = (pt_entry_t) 0; pmap_update(); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static inline void pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pte = vtopte(va); /* a fault on the page table might occur here */ if (*pte) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_pmap = pmap; pv->pv_va = va; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place this entry * after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with desired protection/wiring. 
*/ *pte = (pt_entry_t) ((int) (pa | PG_V | PG_u)); pmap_use_pt(pmap, va); return; } #define MAX_INIT_PT (1024*2048) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, offset, size) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_offset_t offset; vm_offset_t size; { vm_offset_t tmpoff; vm_page_t p; int bits; int objbytes; if (!pmap || ((size > MAX_INIT_PT) && (object->resident_page_count > (MAX_INIT_PT / NBPG)))) { return; } if (!vm_object_lock_try(object)) return; /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (size > (object->size >> 2)) { objbytes = size; for (p = object->memq.tqh_first; ((objbytes > 0) && (p != NULL)); p = p->listq.tqe_next) { tmpoff = p->offset; if (tmpoff < offset) { continue; } tmpoff -= offset; if (tmpoff >= size) { continue; } if (((p->flags & (PG_ACTIVE | PG_INACTIVE)) != 0) && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->bmapped == 0) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) == 0) { vm_page_hold(p); + p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + tmpoff, VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } objbytes -= NBPG; } } else { /* * else lookup the pages one-by-one. */ for (tmpoff = 0; tmpoff < size; tmpoff += NBPG) { p = vm_page_lookup(object, tmpoff + offset); if (p && ((p->flags & (PG_ACTIVE | PG_INACTIVE)) != 0) && (p->bmapped == 0) && (p->busy == 0) && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) == 0) { vm_page_hold(p); + p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + tmpoff, VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } } } vm_object_unlock(object); } #if 0 /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 2 #define PFFOR 2 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -NBPG, NBPG, -2 * NBPG, 2 * NBPG }; void pmap_prefault(pmap, addra, entry, object) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; vm_object_t object; { int i; vm_offset_t starta, enda; vm_offset_t offset, addr; vm_page_t m; int pageorder_index; if (entry->object.vm_object != object) return; if (pmap != &curproc->p_vmspace->vm_pmap) return; starta = addra - PFBAK * NBPG; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) starta = 0; enda = addra + PFFOR * NBPG; if (enda > entry->end) enda = entry->end; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr < starta || addr >= enda) continue; pte = vtopte(addr); if (*pte) continue; offset = (addr - entry->start) + entry->offset; lobject = object; for (m = vm_page_lookup(lobject, offset); (!m && lobject->shadow && !lobject->pager); lobject = lobject->shadow) { offset += lobject->shadow_offset; m = vm_page_lookup(lobject->shadow, offset); } /* * give-up when a page is not in memory */ if (m == NULL) break; if (((m->flags & (PG_CACHE | PG_ACTIVE | PG_INACTIVE)) != 0) && ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->bmapped == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { /* * test results show that the system is faster when * pages are activated. */ if ((m->flags & PG_ACTIVE) == 0) { if( m->flags & PG_CACHE) vm_page_deactivate(m); else vm_page_activate(m); } vm_page_hold(m); + m->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); vm_page_unhold(m); } } } #endif /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. 
*/ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); /* * When unwiring, set the modified bit in the pte -- could have been * changed by the kernel */ if (!wired) (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); bzero(CADDR2, NBPG); *(int *) CMAP2 = 0; pmap_update(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
*/ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); #if __GNUC__ > 1 memcpy(CADDR2, CADDR1, NBPG); #else bcopy(CADDR1, CADDR2, NBPG); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; pmap_update(); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
*/ if (bit & PG_U) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } if (bit & PG_M) { if (pv->pv_va >= USRSTACK) { if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { continue; } if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { splx(s); return TRUE; } else if (pv->pv_va < KERNBASE) { splx(s); return FALSE; } } } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); if (setem) (int) npte = (int) *pte | bit; else (int) npte = (int) *pte & ~bit; *pte = npte; } } splx(s); pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. 
*/ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Routine: pmap_copy_on_write * Function: * Remove write privileges from all * physical maps for this physical page. */ void pmap_copy_on_write(vm_offset_t pa) { pmap_changebit((pa), PG_RW, FALSE); } /* * Miscellaneous support routines follow */ void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. The non-cacheable bits are set on each * mapped page. 
*/ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } #ifdef DEBUG /* print address space of pmap*/ void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 6806) +++ head/sys/kern/vfs_bio.c (revision 6807) @@ -1,1462 +1,1465 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: vfs_bio.c,v 1.30 1995/02/22 09:30:13 davidg Exp $ + * $Id: vfs_bio.c,v 1.31 1995/02/25 01:46:26 davidg Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. */ #define VMIO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct buf *buf; /* buffer header pool */ int nbuf; /* number of buffer headers calculated * elsewhere */ struct swqueue bswlist; int nvmio, nlru; extern vm_map_t buffer_map, io_map, kernel_map, pager_map; void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); void vfs_dirty_pages(struct buf * bp); void vfs_busy_pages(struct buf *, int clear_modify); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * buffers base kva */ caddr_t buffers_kva; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. 
it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; vm_offset_t bogus_offset; int bufspace, maxbufspace; /* * advisory minimum for size of LRU queue or VMIO queue */ int minbuf; /* * Initialize buffer headers and related structures. */ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; bp->b_data = buffers_kva + i * MAXBSIZE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * this will change later!!! */ minbuf = nbuf / 3; maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL); } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf * bp) { int s = splbio(); if (bp->b_qindex != QUEUE_NONE) { if (bp->b_qindex == QUEUE_LRU) --nlru; TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { panic("bremfree: removing a buffer when not on a queue"); } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. 
*/ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) /* count block I/O */ curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); return (biowait(bp)); } else if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0, &bp->b_blkno, (int *) 0); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) /* count block I/O */ curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); ++readwait; } else if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0, &bp->b_blkno, (int *) 0); } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); VOP_STRATEGY(rabp); } else { brelse(rabp); } } if (readwait) { rv 
= biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async.) */ int bwrite(struct buf * bp) { int oldflags = bp->b_flags; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); bp->b_flags |= B_WRITEINPROG; if (oldflags & B_ASYNC) { if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } else if (curproc) { ++curproc->p_stats->p_ru.ru_oublock; } } bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); VOP_STRATEGY(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } else if (curproc) { ++curproc->p_stats->p_ru.ru_oublock; } brelse(bp); return (rtval); } return (0); } int vn_bwrite(ap) struct vop_bwrite_args *ap; { return (bwrite(ap->a_bp)); } /* * Delayed write. (Buffer is marked dirty). */ void bdwrite(struct buf * bp) { if ((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } if (bp->b_flags & B_INVAL) { brelse(bp); return; } if (bp->b_flags & B_TAPE) { bawrite(bp); return; } bp->b_flags &= ~B_READ; vfs_dirty_pages(bp); if ((bp->b_flags & B_DELWRI) == 0) { if (curproc) ++curproc->p_stats->p_ru.ru_oublock; bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); } brelse(bp); return; } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf * bp) { struct vnode *vp; vp = bp->b_vp; bp->b_flags |= B_ASYNC; (void) bwrite(bp); /* * this code supports limits on the amount of outstanding * writes to a disk file. this helps keep from overwhelming * the buffer cache with writes, thereby allowing other files * to be operated upon. 
*/ if (vp->v_numoutput > (nbuf/2)) { int s = splbio(); while (vp->v_numoutput > (nbuf/4)) { vp->v_flag |= VBWAIT; tsleep((caddr_t) &vp->v_numoutput, PRIBIO, "bawnmo", 0); } splx(s); } } /* * Release a buffer. */ void brelse(struct buf * bp) { int s; if (bp->b_flags & B_CLUSTER) { relpbuf(bp); return; } /* anyone need a "free" block? */ s = splbio(); if (needsbuffer) { needsbuffer = 0; wakeup((caddr_t) &needsbuffer); } /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_AGE); wakeup((caddr_t) bp); } else if (bp->b_flags & B_VMIO) { bp->b_flags &= ~(B_WANTED | B_PDWANTED); wakeup((caddr_t) bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; bp->b_flags &= ~(B_DELWRI | B_CACHE); if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) brelvp(bp); } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, so the B_INVAL flag is used to *invalidate* the buffer, * but the VM object is kept around. The B_NOCACHE flag is used to * invalidate the pages in the VM object. */ if (bp->b_flags & B_VMIO) { vm_offset_t foff; vm_object_t obj; int i, resid; vm_page_t m; int iototal = bp->b_bufsize; foff = 0; obj = 0; if (bp->b_npages) { if (bp->b_vp && bp->b_vp->v_mount) { foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; } else { /* * vnode pointer has been ripped away -- * probably file gone... 
*/ foff = bp->b_pages[0]->offset; } } for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { panic("brelse: bogus page found"); } resid = (m->offset + PAGE_SIZE) - foff; if (resid > iototal) resid = iototal; if (resid > 0) { if (bp->b_flags & (B_ERROR | B_NOCACHE)) { vm_page_set_invalid(m, foff, resid); } else if ((bp->b_flags & B_DELWRI) == 0) { vm_page_set_clean(m, foff, resid); vm_page_set_valid(m, foff, resid); } } else { vm_page_test_dirty(m); } foff += resid; iototal -= resid; } if (bp->b_flags & B_INVAL) { for(i=0;ib_npages;i++) { m = bp->b_pages[i]; --m->bmapped; if (m->bmapped == 0) { PAGE_WAKEUP(m); if (m->valid == 0) { - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } else if ((m->dirty & m->valid) == 0 && (m->flags & PG_REFERENCED) == 0 && !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) vm_page_cache(m); else if( (m->flags & PG_ACTIVE) == 0) vm_page_activate(m); } } bufspace -= bp->b_bufsize; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); --nvmio; } } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_AGE) { bp->b_qindex 
= QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { if (bp->b_flags & B_VMIO) bp->b_qindex = QUEUE_VMIO; else { bp->b_qindex = QUEUE_LRU; ++nlru; } TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } /* unlock */ bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE); splx(s); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. */ void vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; s = splbio(); if( vp->v_mount && (vp->v_flag & VVMIO) && (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { int size = vp->v_mount->mnt_stat.f_iosize; for (i = 1; i < MAXPHYS / size; i++) { if ((bpa = incore(vp, lblkno + i)) && ((bpa->b_flags & (B_BUSY | B_DELWRI | B_BUSY | B_CLUSTEROK | B_INVAL)) == B_DELWRI | B_CLUSTEROK) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE)) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { cluster_wbuild(vp, NULL, size, lblkno, ncl, -1); splx(s); return; } } /* * default (old) behavior, writing out only one block */ bremfree(bp); bp->b_flags |= B_BUSY | B_ASYNC; bwrite(bp); splx(s); } /* * Find a buffer header which is available for use. */ struct buf * getnewbuf(int slpflag, int slptimeo, int doingvmio) { struct buf *bp; int s; int firstbp = 1; s = splbio(); start: if (bufspace >= maxbufspace) goto trytofreespace; /* can we constitute a new buffer? 
*/ if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) { if (bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue"); bremfree(bp); goto fillbuf; } trytofreespace: /* * We keep the file I/O from hogging metadata I/O * This is desirable because file data is cached in the * VM/Buffer cache even if a buffer is freed. */ if (bp = bufqueues[QUEUE_AGE].tqh_first) { if (bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue"); } else if ((nvmio > nbuf - minbuf) && (bp = bufqueues[QUEUE_VMIO].tqh_first)) { if (bp->b_qindex != QUEUE_VMIO) panic("getnewbuf: inconsistent VMIO queue"); } else if ((nlru > nbuf - minbuf) && (bp = bufqueues[QUEUE_LRU].tqh_first)) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue"); } if (!bp) { if (doingvmio) { if (bp = bufqueues[QUEUE_VMIO].tqh_first) { if (bp->b_qindex != QUEUE_VMIO) panic("getnewbuf: inconsistent VMIO queue"); } else if (bp = bufqueues[QUEUE_LRU].tqh_first) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue"); } } else { if (bp = bufqueues[QUEUE_LRU].tqh_first) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue"); } else if (bp = bufqueues[QUEUE_VMIO].tqh_first) { if (bp->b_qindex != QUEUE_VMIO) panic("getnewbuf: inconsistent VMIO queue"); } } } if (!bp) { /* wait for a free buffer of any kind */ needsbuffer = 1; tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo); splx(s); return (0); } /* if we are a delayed write, convert to an async write */ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { vfs_bio_awrite(bp); if (!slpflag && !slptimeo) { splx(s); return (0); } goto start; } bremfree(bp); if (bp->b_flags & B_VMIO) { bp->b_flags |= B_INVAL | B_BUSY; brelse(bp); bremfree(bp); } if (bp->b_vp) brelvp(bp); /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); fillbuf: bp->b_flags |= B_BUSY; LIST_REMOVE(bp, b_hash); 
LIST_INSERT_HEAD(&invalhash, bp, b_hash); splx(s); if (bp->b_bufsize) { allocbuf(bp, 0, 0); } bp->b_flags = B_BUSY; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_wcred = bp->b_rcred = NOCRED; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; if (bufspace >= maxbufspace) { s = splbio(); bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } return (bp); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; int s = splbio(); bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp) { /* hit */ if (bp->b_lblkno == blkno && bp->b_vp == vp && (bp->b_flags & B_INVAL) == 0) { splx(s); return (bp); } bp = bp->b_hash.le_next; } splx(s); return (0); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t off, toff, tinc; vm_page_t m; if (incore(vp, blkno)) return 1; if (vp->v_mount == 0) return 0; if ((vp->v_vmdata == 0) || (vp->v_flag & VVMIO) == 0) return 0; obj = (vm_object_t) vp->v_vmdata; tinc = PAGE_SIZE; if (tinc > vp->v_mount->mnt_stat.f_iosize) tinc = vp->v_mount->mnt_stat.f_iosize; off = blkno * vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { int mask; m = vm_page_lookup(obj, trunc_page(toff + off)); if (!m) return 0; if (vm_page_is_valid(m, toff + off, tinc) == 0) return 0; } return 1; } /* * Get a block given a specified block and offset into a file/device. 
*/ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; vm_offset_t off; int nleft; s = splbio(); loop: if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_cache_min) - wakeup((caddr_t) &vm_pages_needed); + pagedaemon_wakeup(); if (bp = incore(vp, blkno)) { if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; if (curproc == pageproc) { bp->b_flags |= B_PDWANTED; wakeup((caddr_t) &cnt.v_free_count); } if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo)) goto loop; splx(s); return (struct buf *) NULL; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistancies */ if (bp->b_bcount != size) { #if defined(VFS_BIO_DEBUG) printf("getblk: invalid buffer size: %ld\n", bp->b_bcount); #endif bp->b_flags |= B_INVAL; bwrite(bp); goto loop; } splx(s); return (bp); } else { vm_object_t obj; int doingvmio; if ((obj = (vm_object_t) vp->v_vmdata) && (vp->v_flag & VVMIO)) { doingvmio = 1; } else { doingvmio = 0; } if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) { if (slpflag || slptimeo) return NULL; goto loop; } /* * It is possible that another buffer has been constituted * during the time that getnewbuf is blocked. This checks * for this possibility, and handles it. */ if (incore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); if (doingvmio) { bp->b_flags |= (B_VMIO | B_CACHE); #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif ++nvmio; } else { if (bp->b_flags & B_VMIO) --nvmio; bp->b_flags &= ~B_VMIO; } splx(s); if (!allocbuf(bp, size, 1)) { s = splbio(); goto loop; } return (bp); } } /* * Get an empty, disassociated buffer of given size. 
*/ struct buf * geteblk(int size) { struct buf *bp; while ((bp = getnewbuf(0, 0, 0)) == 0); allocbuf(bp, size, 0); bp->b_flags |= B_INVAL; return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). */ int allocbuf(struct buf * bp, int size, int vmio) { int s; int newbsize, mbsize; int i; if ((bp->b_flags & B_VMIO) == 0) { /* * Just get anonymous memory from the kernel */ mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE; newbsize = round_page(size); if (newbsize == bp->b_bufsize) { bp->b_bcount = size; return 1; } else if (newbsize < bp->b_bufsize) { vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); bufspace -= (bp->b_bufsize - newbsize); } else if (newbsize > bp->b_bufsize) { vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); bufspace += (newbsize - bp->b_bufsize); } } else { vm_page_t m; int desiredpages; newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE; desiredpages = round_page(newbsize) / PAGE_SIZE; if (newbsize == bp->b_bufsize) { bp->b_bcount = size; return 1; } else if (newbsize < bp->b_bufsize) { if (desiredpages < bp->b_npages) { pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages)); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; s = splhigh(); while ((m->flags & PG_BUSY) || (m->busy != 0)) { m->flags |= PG_WANTED; tsleep(m, PVM, "biodep", 0); } splx(s); if (m->bmapped == 0) { printf("allocbuf: bmapped is zero for page %d\n", i); panic("allocbuf: 
error"); } --m->bmapped; if (m->bmapped == 0) { PAGE_WAKEUP(m); if (m->valid == 0) { - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } bp->b_pages[i] = NULL; } bp->b_npages = desiredpages; bufspace -= (bp->b_bufsize - newbsize); } } else { vm_object_t obj; vm_offset_t tinc, off, toff, objoff; int pageindex, curbpnpages; struct vnode *vp; int bsize; vp = bp->b_vp; bsize = vp->v_mount->mnt_stat.f_iosize; if (bp->b_npages < desiredpages) { obj = (vm_object_t) vp->v_vmdata; tinc = PAGE_SIZE; if (tinc > bsize) tinc = bsize; off = bp->b_lblkno * bsize; curbpnpages = bp->b_npages; doretry: for (toff = 0; toff < newbsize; toff += tinc) { int mask; int bytesinpage; pageindex = toff / PAGE_SIZE; objoff = trunc_page(toff + off); if (pageindex < curbpnpages) { int pb; m = bp->b_pages[pageindex]; if (m->offset != objoff) panic("allocbuf: page changed offset??!!!?"); bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (!vm_page_is_valid(m, toff + off, bytesinpage)) { bp->b_flags &= ~B_CACHE; } if ((m->flags & PG_ACTIVE) == 0) vm_page_activate(m); continue; } m = vm_page_lookup(obj, objoff); if (!m) { m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); if (!m) { int j; for (j = bp->b_npages; j < pageindex; j++) { vm_page_t mt = bp->b_pages[j]; PAGE_WAKEUP(mt); if (mt->valid == 0 && mt->bmapped == 0) { vm_page_free(mt); } } VM_WAIT; if (vmio && (bp->b_flags & B_PDWANTED)) { bp->b_flags |= B_INVAL; brelse(bp); return 0; } curbpnpages = bp->b_npages; goto doretry; } m->valid = 0; vm_page_activate(m); } else if ((m->valid == 0) || (m->flags & PG_BUSY)) { int j; int bufferdestroyed = 0; for (j = bp->b_npages; j < pageindex; j++) { vm_page_t mt = bp->b_pages[j]; PAGE_WAKEUP(mt); if (mt->valid == 0 && mt->bmapped == 0) { vm_page_free(mt); } } if (vmio && (bp->b_flags & B_PDWANTED)) { bp->b_flags |= B_INVAL; brelse(bp); VM_WAIT; bufferdestroyed = 1; } s = splbio(); if (m->flags & PG_BUSY) { 
m->flags |= PG_WANTED; tsleep(m, PRIBIO, "pgtblk", 0); } else if( m->valid == 0 && m->bmapped == 0) { vm_page_free(m); } splx(s); if (bufferdestroyed) return 0; curbpnpages = bp->b_npages; goto doretry; } else { int pb; if ((m->flags & PG_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { int j; for (j = bp->b_npages; j < pageindex; j++) { vm_page_t mt = bp->b_pages[j]; PAGE_WAKEUP(mt); if (mt->valid == 0 && mt->bmapped == 0) { vm_page_free(mt); } } VM_WAIT; if (vmio && (bp->b_flags & B_PDWANTED)) { bp->b_flags |= B_INVAL; brelse(bp); return 0; } curbpnpages = bp->b_npages; goto doretry; } bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; if (!vm_page_is_valid(m, toff + off, bytesinpage)) { bp->b_flags &= ~B_CACHE; } if ((m->flags & PG_ACTIVE) == 0) vm_page_activate(m); m->flags |= PG_BUSY; } bp->b_pages[pageindex] = m; curbpnpages = pageindex + 1; } if (bsize >= PAGE_SIZE) { for (i = bp->b_npages; i < curbpnpages; i++) { m = bp->b_pages[i]; if (m->valid == 0) { bp->b_flags &= ~B_CACHE; } m->bmapped++; PAGE_WAKEUP(m); } } else { if (!vm_page_is_valid(bp->b_pages[0], off, bsize)) bp->b_flags &= ~B_CACHE; bp->b_pages[0]->bmapped++; PAGE_WAKEUP(bp->b_pages[0]); } bp->b_npages = curbpnpages; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages); bp->b_data += off % PAGE_SIZE; } bufspace += (newbsize - bp->b_bufsize); } } bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; } /* * Wait for buffer I/O completion, returning error status. 
*/ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep((caddr_t) bp, PRIBIO, "biowait", 0); if ((bp->b_flags & B_ERROR) || bp->b_error) { if ((bp->b_flags & B_INVAL) == 0) { bp->b_flags |= B_INVAL; bp->b_dev = NODEV; LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); wakeup((caddr_t) bp); } if (!bp->b_error) bp->b_error = EIO; else bp->b_flags |= B_ERROR; splx(s); return (bp->b_error); } else { splx(s); return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. */ void biodone(register struct buf * bp) { int s; s = splbio(); - if (bp->b_flags & B_DONE) + if (bp->b_flags & B_DONE) { + splx(s); printf("biodone: buffer already done\n"); + return; + } bp->b_flags |= B_DONE; if ((bp->b_flags & B_READ) == 0) { struct vnode *vp = bp->b_vp; vwakeup(bp); if (vp && (vp->v_numoutput == (nbuf/4)) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } #ifdef BOUNCE_BUFFERS if (bp->b_flags & B_BOUNCE) vm_bounce_free(bp); #endif /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (bp->b_flags & B_VMIO) { int i, resid; vm_offset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; obj = (vm_object_t) vp->v_vmdata; if (!obj) { return; } #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif iosize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, foff); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif --obj->paging_in_progress; continue; } bp->b_pages[i] = m; 
pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (trunc_page(foff) != m->offset) { printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset); } #endif resid = (m->offset + PAGE_SIZE) - foff; if (resid > iosize) resid = iosize; if (resid > 0) { vm_page_set_valid(m, foff, resid); vm_page_set_clean(m, foff, resid); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, off: %d, foff: %d, resid: %d, index: %d\n", m->offset, foff, resid, i); printf(" iosize: %d, lblkno: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno); printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n", m->valid, m->dirty, m->bmapped); panic("biodone: page busy < 0\n"); } --m->busy; PAGE_WAKEUP(m); --obj->paging_in_progress; foff += resid; iosize -= resid; } if (obj && obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup((caddr_t) obj); } } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - so * no need to do a wakeup here in the async case. */ if (bp->b_flags & B_ASYNC) { brelse(bp); } else { bp->b_flags &= ~(B_WANTED | B_PDWANTED); wakeup((caddr_t) bp); } splx(s); } int count_lock_queue() { int count; struct buf *bp; count = 0; for (bp = bufqueues[QUEUE_LOCKED].tqh_first; bp != NULL; bp = bp->b_freelist.tqe_next) count++; return (count); } int vfs_update_interval = 30; void vfs_update() { (void) spl0(); while (1) { tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL, NULL); } } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. 
*/ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = (vm_object_t) vp->v_vmdata; vm_offset_t foff; foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, foff); if (!m) { panic("vfs_unbusy_pages: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } --obj->paging_in_progress; --m->busy; PAGE_WAKEUP(m); } if (obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup((caddr_t) obj); } } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i; if (bp->b_flags & B_VMIO) { vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata; vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; int iocount = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = (m->offset + PAGE_SIZE) - foff; if (resid > iocount) resid = iocount; obj->paging_in_progress++; m->busy++; if (clear_modify) { vm_page_test_dirty(m); - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ); + vm_page_protect(m, VM_PROT_READ); } else if (bp->b_bcount >= PAGE_SIZE) { if (m->valid && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } } foff += resid; iocount -= resid; } } } /* * Tell the VM system that the pages associated with this buffer * are dirty. This is in case of the unlikely circumstance that * a buffer has to be destroyed before it is flushed. 
*/
void
vfs_dirty_pages(struct buf * bp)
{
	vm_offset_t pos;
	int left;
	int idx;

	/* only VMIO buffers have VM pages to mark */
	if ((bp->b_flags & B_VMIO) == 0)
		return;

	pos = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
	left = bp->b_bufsize;

	idx = 0;
	while (idx < bp->b_npages) {
		vm_page_t page = bp->b_pages[idx];
		/* bytes of this page that fall inside the buffer */
		int span = (page->offset + PAGE_SIZE) - pos;

		if (span > left)
			span = left;
		if (span > 0) {
			vm_page_set_valid(page, pos, span);
			vm_page_set_dirty(page, pos, span);
		}
		PAGE_WAKEUP(page);
		pos += span;
		left -= span;
		idx++;
	}
}

/*
 * vm_hold_load_pages and vm_hold_free_pages move anonymous pages (not
 * associated with a file object) into and out of a buffer's address
 * space.
 *
 * vm_hold_load_pages allocates, wires and maps one kernel_object page
 * for every page-sized step of the rounded range, waiting with VM_WAIT
 * whenever the allocation fails.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t va = round_page(froma);
	vm_offset_t end = round_page(toa);
	vm_page_t page;

	while (va < end) {
		/* keep trying until a page can be had */
		while ((page = vm_page_alloc(kernel_object,
		    va - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL)) == 0) {
			VM_WAIT;
		}
		vm_page_wire(page);
		pmap_kenter(va, VM_PAGE_TO_PHYS(page));
		bp->b_pages[((caddr_t) va - bp->b_data) / PAGE_SIZE] = page;
		PAGE_WAKEUP(page);
		bp->b_npages++;
		va += PAGE_SIZE;
	}
}

/*
 * Unmap and free the buffer's pages for every page-sized step of the
 * rounded range, clearing their slots in b_pages.
 */
void
vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t va = round_page(froma);
	vm_offset_t end = round_page(toa);

	while (va < end) {
		int slot = ((caddr_t) va - bp->b_data) / PAGE_SIZE;
		vm_page_t page = bp->b_pages[slot];

		bp->b_pages[slot] = 0;
		pmap_kremove(va);
		vm_page_free(page);
		--bp->b_npages;
		va += PAGE_SIZE;
	}
}

/* Intentionally empty. */
void
bufstats()
{
}