diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index e2116974f25b..70fd00d6a87f 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -1,687 +1,685 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id$ + * $Id: vm_glue.c,v 1.3 1994/08/02 07:55:19 davidg Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include extern char kstack[]; int avefree = 0; /* XXX */ int readbuffers = 0; /* XXX allow kgdb to read kernel buffer pool */ /* vm_map_t upages_map; */ void swapout(struct proc *p); int kernacc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; saddr = trunc_page(addr); eaddr = round_page(addr+len); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); return(rv == TRUE); } int useracc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; /* - * XXX - specially disallow access to user page tables - they are - * in the map. - * - * XXX - don't specially disallow access to the user area - treat - * it as incorrectly as elsewhere. + * XXX - check separately to disallow access to user area and user + * page tables - they are in the map. * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was - * only used (as an end address) in trap.c. Use it as an end - * address here too. + * once only used (as an end address) in trap.c. Use it as an end + * address here too. This bogusness has spread. I just fixed + * where it was used as a max in vm_mmap.c. */ - if ((vm_offset_t) addr >= VM_MAXUSER_ADDRESS - || (vm_offset_t) addr + len > VM_MAXUSER_ADDRESS - || (vm_offset_t) addr + len <= (vm_offset_t) addr) { + if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS + || (vm_offset_t) addr + len < (vm_offset_t) addr) { return (FALSE); } rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr+len), prot); return(rv == TRUE); } #ifdef KGDB /* * Change protections on kernel pages from addr to addr+len * (presumably so debugger can plant a breakpoint). * All addresses are assumed to reside in the Sysmap, */ chgkprot(addr, len, rw) register caddr_t addr; int len, rw; { vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; vm_map_protect(kernel_map, trunc_page(addr), round_page(addr+len), prot, FALSE); } #endif void vslock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr+len), FALSE); } void vsunlock(addr, len, dirtied) caddr_t addr; u_int len; int dirtied; { #ifdef lint dirtied++; #endif lint vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr+len), TRUE); } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. * NOTE: the kernel stack may be at a different location in the child * process, and thus addresses of automatic variables may be invalid * after cpu_fork returns in the child process. We do nothing here * after cpu_fork returns. 
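/*
 * Illustrative sketch (not part of this patch): the new useracc() test
 * rejects ranges that wrap past the end of the address space by relying
 * on unsigned wraparound -- "addr + len < addr" is true exactly when the
 * addition overflowed.  A minimal user-space analogue, with hypothetical
 * names, assuming addresses fit in uintptr_t:
 */
#include <stdint.h>
#include <stddef.h>

static int
range_in_user_space(uintptr_t addr, size_t len, uintptr_t max_user_addr)
{
	uintptr_t end = addr + (uintptr_t)len;

	if (end < addr)			/* wrapped around: bogus length */
		return 0;
	if (end > max_user_addr)	/* runs past the top of user space */
		return 0;
	return 1;
}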
*/ int vm_fork(p1, p2, isvfork) register struct proc *p1, *p2; int isvfork; { register struct user *up; vm_offset_t addr, ptaddr; int i; struct vm_map *vp; while( cnt.v_free_count < cnt.v_free_min) VM_WAIT; /* * avoid copying any of the parent's pagetables or other per-process * objects that reside in the map by marking all of them non-inheritable */ (void)vm_map_inherit(&p1->p_vmspace->vm_map, UPT_MIN_ADDRESS - UPAGES * NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE); p2->p_vmspace = vmspace_fork(p1->p_vmspace); #ifdef SYSVSHM if (p1->p_vmspace->vm_shm) shmfork(p1, p2, isvfork); #endif /* * Allocate a wired-down (for now) pcb and kernel stack for the process */ addr = (vm_offset_t) kstack; vp = &p2->p_vmspace->vm_map; /* ream out old pagetables and kernel stack */ (void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr); /* get new pagetables and kernel stack */ (void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE); /* force in the page table encompassing the UPAGES */ ptaddr = trunc_page((u_int)vtopte(addr)); vm_map_pageable(vp, ptaddr, ptaddr + NBPG, FALSE); /* and force in (demand-zero) the UPAGES */ vm_map_pageable(vp, addr, addr + UPAGES * NBPG, FALSE); /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *)kmem_alloc_pageable(kernel_map, UPAGES * NBPG); /* and force-map the upages into the kernel pmap */ for (i = 0; i < UPAGES; i++) pmap_enter(vm_map_pmap(kernel_map), ((vm_offset_t) up) + NBPG * i, pmap_extract(vp->pmap, addr + NBPG * i), VM_PROT_READ|VM_PROT_WRITE, 1); /* and allow the UPAGES page table entry to be paged (at the vm system level) */ vm_map_pageable(vp, ptaddr, ptaddr + NBPG, TRUE); p2->p_addr = up; /* * p_stats and p_sigacts currently point at fields * in the user struct but not at &u, instead at p_addr. * Copy p_sigacts and parts of p_stats; zero the rest * of p_stats (statistics). */ p2->p_stats = &up->u_stats; p2->p_sigacts = &up->u_sigacts; up->u_sigacts = *p1->p_sigacts; bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t)&up->u_stats.pstat_endzero - (caddr_t)&up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t)&up->u_stats.pstat_endcopy - (caddr_t)&up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the kernel stack and pcb, * and make the child ready to run. It marks the child * so that it can return differently than the parent. * It returns twice, once in the parent process and * once in the child. */ return (cpu_fork(p1, p2)); } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. */ void vm_init_limits(p) register struct proc *p; { - int tmp; + int rss_limit; /* * Set up the initial limits on process VM. - * Set the maximum resident set size to be all - * of (reasonably) available memory. This causes - * any single, large process to start random page - * replacement once it fills memory. + * Set the maximum resident set size to be half + * of (reasonably) available memory. Since this + * is a soft limit, it comes into effect only + * when the system is out of memory - half of + * main memory helps to favor smaller processes, + * and reduces thrashing of the object cache. 
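/*
 * Illustrative arithmetic (not part of this patch): the new RSS soft
 * limit set just below is half of the free page count, but never less
 * than 32 pages, which with 4 KB pages is the 128 KB floor the comment
 * mentions.  A hypothetical stand-alone version of that computation:
 */
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096	/* assumed page size */

static uint64_t
default_rss_limit_bytes(uint64_t free_pages)
{
	uint64_t rss_pages = free_pages / 2;

	if (rss_pages < 32)		/* floor: 32 pages == 128 KB */
		rss_pages = 32;
	return rss_pages * SKETCH_PAGE_SIZE;
}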
*/ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; - tmp = ((2 * cnt.v_free_count) / 3) - 32; - if (cnt.v_free_count < 512) - tmp = cnt.v_free_count; - p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(tmp); + /* limit the limit to no less than 128K */ + rss_limit = max(cnt.v_free_count / 2, 32); + p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } #ifdef DEBUG int enableswap = 1; int swapdebug = 0; #define SDB_FOLLOW 1 #define SDB_SWAPIN 2 #define SDB_SWAPOUT 4 #endif void faultin(p) struct proc *p; { vm_offset_t i; vm_offset_t vaddr, ptaddr; vm_offset_t v, v1; struct user *up; int s; int opflag; if ((p->p_flag & P_INMEM) == 0) { int rv0, rv1; vm_map_t map; ++p->p_lock; map = &p->p_vmspace->vm_map; /* force the page table encompassing the kernel stack (upages) */ ptaddr = trunc_page((u_int)vtopte(kstack)); vm_map_pageable(map, ptaddr, ptaddr + NBPG, FALSE); /* wire in the UPAGES */ vm_map_pageable(map, (vm_offset_t) kstack, (vm_offset_t) kstack + UPAGES * NBPG, FALSE); /* and map them nicely into the kernel pmap */ for (i = 0; i < UPAGES; i++) { vm_offset_t off = i * NBPG; vm_offset_t pa = (vm_offset_t) pmap_extract(&p->p_vmspace->vm_pmap, (vm_offset_t) kstack + off); pmap_enter(vm_map_pmap(kernel_map), ((vm_offset_t)p->p_addr) + off, pa, VM_PROT_READ|VM_PROT_WRITE, 1); } /* and let the page table pages go (at least above pmap level) */ vm_map_pageable(map, ptaddr, ptaddr + NBPG, TRUE); s = splhigh(); if (p->p_stat == SRUN) setrunqueue(p); p->p_flag |= P_INMEM; /* undo the effect of setting SLOCK above */ --p->p_lock; splx(s); } } int swapinreq; int percentactive; /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. */ void scheduler() { register struct proc *p; register int pri; struct proc *pp; int ppri; vm_offset_t addr; int lastidle, lastrun; int curidle, currun; int forceload; int percent; int ntries; lastidle = 0; lastrun = 0; loop: ntries = 0; vmmeter(); curidle = cp_time[CP_IDLE]; currun = cp_time[CP_USER] + cp_time[CP_SYS] + cp_time[CP_NICE]; percent = (100*(currun-lastrun)) / ( 1 + (currun-lastrun) + (curidle-lastidle)); lastrun = currun; lastidle = curidle; if( percent > 100) percent = 100; percentactive = percent; if( percentactive < 25) forceload = 1; else forceload = 0; loop1: pp = NULL; ppri = INT_MIN; for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) { int mempri; pri = p->p_swtime + p->p_slptime - p->p_nice * 8; mempri = pri > 0 ? pri : 0; /* * if this process is higher priority and there is * enough space, then select this process instead * of the previous selection. */ if (pri > ppri && (((cnt.v_free_count + (mempri * (4*PAGE_SIZE) / PAGE_SIZE) >= (p->p_vmspace->vm_swrss)) || (ntries > 0 && forceload)))) { pp = p; ppri = pri; } } } if ((pp == NULL) && (ntries == 0) && forceload) { ++ntries; goto loop1; } /* * Nothing to do, back to sleep */ if ((p = pp) == NULL) { tsleep((caddr_t)&proc0, PVM, "sched", 0); goto loop; } /* * We would like to bring someone in. (only if there is space). 
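/*
 * Illustrative sketch (not part of this patch): the scheduler above picks
 * the swapped-out runnable process with the highest value of
 * p_swtime + p_slptime - 8 * p_nice, provided enough free memory exists
 * for its remembered resident set or "forceload" is in effect (the kernel
 * version also credits a small priority-based fudge factor).  A
 * hypothetical user-space rendering of the selection loop:
 */
#include <stddef.h>
#include <limits.h>

struct sketch_proc {
	int	swtime;		/* seconds swapped out */
	int	slptime;	/* seconds sleeping */
	int	nice;		/* nice value */
	int	swrss;		/* resident set at swapout, in pages */
};

static struct sketch_proc *
pick_swapin_candidate(struct sketch_proc *procs, int nprocs,
    int free_pages, int forceload)
{
	struct sketch_proc *best = NULL;
	int best_pri = INT_MIN;
	int i;

	for (i = 0; i < nprocs; i++) {
		int pri = procs[i].swtime + procs[i].slptime -
		    procs[i].nice * 8;
		if (pri > best_pri &&
		    (free_pages >= procs[i].swrss || forceload)) {
			best = &procs[i];
			best_pri = pri;
		}
	}
	return best;	/* NULL if nothing fits in memory */
}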
*/ /* printf("swapin: %d, free: %d, res: %d, min: %d\n", p->p_pid, cnt.v_free_count, cnt.v_free_reserved, cnt.v_free_min); */ (void) splhigh(); if ((forceload && (cnt.v_free_count > (cnt.v_free_reserved + UPAGES + 1))) || (cnt.v_free_count >= cnt.v_free_min)) { spl0(); faultin(p); p->p_swtime = 0; goto loop; } /* * log the memory shortage */ swapinreq += p->p_vmspace->vm_swrss; /* * Not enough memory, jab the pageout daemon and wait til the * coast is clear. */ if( cnt.v_free_count < cnt.v_free_min) { VM_WAIT; } else { tsleep((caddr_t)&proc0, PVM, "sched", 0); } (void) spl0(); goto loop; } #define swappable(p) \ (((p)->p_lock == 0) && \ ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO)) == P_INMEM) extern int vm_pageout_free_min; /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_threads() { register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; int tpri; int didswap = 0; int swapneeded = swapinreq; extern int maxslp; int runnablenow; int s; swapmore: runnablenow = 0; outp = outp2 = NULL; outpri = outpri2 = INT_MIN; for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { if (!swappable(p)) continue; switch (p->p_stat) { case SRUN: ++runnablenow; /* * count the process as being in a runnable state */ if ((tpri = p->p_swtime + p->p_nice * 8) > outpri2) { outp2 = p; outpri2 = tpri; } continue; case SSLEEP: case SSTOP: /* * do not swapout a process that is waiting for VM datastructures * there is a possible deadlock. */ if (!lock_try_write( &p->p_vmspace->vm_map.lock)) { continue; } vm_map_unlock( &p->p_vmspace->vm_map); if (p->p_slptime > maxslp) { swapout(p); didswap++; } else if ((tpri = p->p_slptime + p->p_nice * 8) > outpri) { outp = p; outpri = tpri ; } continue; } } /* * We swapout only if there are more than two runnable processes or if * another process needs some space to swapin. */ if ((swapinreq || ((percentactive > 90) && (runnablenow > 2))) && (((cnt.v_free_count + cnt.v_inactive_count) <= (cnt.v_free_target + cnt.v_inactive_target)) || (cnt.v_free_count < cnt.v_free_min))) { if ((p = outp) == 0) { p = outp2; } if (p) { swapout(p); didswap = 1; } } /* * if we previously had found a process to swapout, and we need to swapout * more then try again. */ #if 0 if( p && swapinreq) goto swapmore; #endif /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. 
*/ if (didswap) { if (swapneeded) wakeup((caddr_t)&proc0); swapinreq = 0; } } void swapout(p) register struct proc *p; { vm_offset_t addr; struct pmap *pmap = &p->p_vmspace->vm_pmap; vm_map_t map = &p->p_vmspace->vm_map; vm_offset_t ptaddr; int i; ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = p->p_vmspace->vm_pmap.pm_stats.resident_count; /* * and decrement the amount of needed space */ swapinreq -= min(swapinreq, p->p_vmspace->vm_pmap.pm_stats.resident_count); (void) splhigh(); p->p_flag &= ~P_INMEM; if (p->p_stat == SRUN) remrq(p); (void) spl0(); ++p->p_lock; /* let the upages be paged */ pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr, ((vm_offset_t) p->p_addr) + UPAGES * NBPG); vm_map_pageable(map, (vm_offset_t) kstack, (vm_offset_t) kstack + UPAGES * NBPG, TRUE); --p->p_lock; p->p_swtime = 0; } /* * The rest of these routines fake thread handling */ #ifndef assert_wait void assert_wait(event, ruptible) int event; boolean_t ruptible; { #ifdef lint ruptible++; #endif curproc->p_thread = event; } #endif void thread_block(char *msg) { if (curproc->p_thread) tsleep((caddr_t)curproc->p_thread, PVM, msg, 0); } void thread_sleep_(event, lock, wmesg) int event; simple_lock_t lock; char *wmesg; { curproc->p_thread = event; simple_unlock(lock); if (curproc->p_thread) { tsleep((caddr_t)event, PVM, wmesg, 0); } } #ifndef thread_wakeup void thread_wakeup(event) int event; { wakeup((caddr_t)event); } #endif /* * DEBUG stuff */ int indent = 0; #include /* see subr_prf.c */ /*ARGSUSED2*/ void #if __STDC__ iprintf(const char *fmt, ...) #else iprintf(fmt /* , va_alist */) char *fmt; /* va_dcl */ #endif { register int i; va_list ap; for (i = indent; i >= 8; i -= 8) printf("\t"); while (--i >= 0) printf(" "); va_start(ap, fmt); printf("%r", fmt, ap); va_end(ap); } diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index d0fb654c9549..2c1624b251cb 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1,837 +1,837 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id$ + * $Id: vm_mmap.c,v 1.3 1994/08/02 07:55:28 davidg Exp $ */ /* * Mapped file (mmap) interface to VM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG int mmapdebug = 0; #define MDB_FOLLOW 0x01 #define MDB_SYNC 0x02 #define MDB_MAPIT 0x04 #endif struct sbrk_args { int incr; }; /* ARGSUSED */ int sbrk(p, uap, retval) struct proc *p; struct sbrk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } struct sstk_args { int incr; }; /* ARGSUSED */ int sstk(p, uap, retval) struct proc *p; struct sstk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) struct getpagesize_args { int dummy; }; /* ARGSUSED */ int ogetpagesize(p, uap, retval) struct proc *p; struct getpagesize_args *uap; int *retval; { *retval = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ struct mmap_args { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #ifdef COMPAT_43 struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; int ommap(p, uap, retval) struct proc *p; register struct ommap_args *uap; int *retval; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC|PROT_WRITE, PROT_READ, PROT_EXEC|PROT_READ, PROT_WRITE|PROT_READ, PROT_EXEC|PROT_WRITE|PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot&0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; if (uap->flags & OMAP_INHERIT) nargs.flags |= MAP_INHERIT; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(p, &nargs, retval)); } #endif int mmap(p, uap, retval) struct proc *p; register struct mmap_args *uap; int *retval; { register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size; vm_prot_t prot, maxprot; caddr_t handle; int flags, error; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("mmap(%d): addr %x len %x pro %x flg %x fd %d pos %x\n", p->p_pid, uap->addr, uap->len, prot, flags, uap->fd, (vm_offset_t)uap->pos); #endif /* * Address (if FIXED) must be page aligned. * Size is implicitly rounded to a page boundary. */ addr = (vm_offset_t) uap->addr; if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || (ssize_t)uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); size = (vm_size_t) round_page(uap->len); /* * Check for illegal addresses. Watch out for address wrap... 
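/*
 * Illustrative sketch (not part of this patch): mmap() above insists that
 * a MAP_FIXED address be page aligned ((addr & PAGE_MASK) == 0) and
 * rounds the requested length up to a whole page.  Assuming a
 * power-of-two page size, the usual trunc/round helpers look like this
 * (hypothetical names):
 */
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096u
#define SKETCH_PAGE_MASK	(SKETCH_PAGE_SIZE - 1)

#define sketch_trunc_page(x)	((uintptr_t)(x) & ~(uintptr_t)SKETCH_PAGE_MASK)
#define sketch_round_page(x) \
	(((uintptr_t)(x) + SKETCH_PAGE_MASK) & ~(uintptr_t)SKETCH_PAGE_MASK)

/* e.g. sketch_trunc_page(0x1234) == 0x1000, sketch_round_page(0x1234) == 0x2000 */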
* Note that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { - if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif - if (addr > addr + size) + if (addr + size < addr) return (EINVAL); } /* * XXX if no hint provided for a non-fixed mapping place it after * the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ if (addr == 0 && (flags & MAP_FIXED) == 0) addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; } else { /* * Mapping file, get fp for validation. * Obtain vnode and make sure it is of appropriate type. */ if (((unsigned)uap->fd) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EINVAL); vp = (struct vnode *)fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) return (EINVAL); /* * XXX hack to handle use of /dev/zero to map anon * memory (ala SunOS). */ if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; } else { /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? * What if proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) return (EACCES); if (flags & MAP_SHARED) { if (fp->f_flag & FWRITE) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) return (EACCES); } else maxprot |= VM_PROT_WRITE; handle = (caddr_t)vp; } } error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, handle, (vm_offset_t)uap->pos); if (error == 0) *retval = (int)addr; return (error); } struct msync_args { caddr_t addr; int len; }; int msync(p, uap, retval) struct proc *p; struct msync_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; vm_map_t map; int rv; boolean_t syncio, invalidate; #ifdef DEBUG if (mmapdebug & (MDB_FOLLOW|MDB_SYNC)) printf("msync(%d): addr %x len %x\n", p->p_pid, uap->addr, uap->len); #endif if (((int)uap->addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); map = &p->p_vmspace->vm_map; addr = (vm_offset_t)uap->addr; size = (vm_size_t)uap->len; /* * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we * don't really keep track of individual mmaps so we approximate * by flushing the range of the map entry containing addr. * This can be incorrect if the region splits or is coalesced * with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv) return (EINVAL); addr = entry->start; size = entry->end - entry->start; } #ifdef DEBUG if (mmapdebug & MDB_SYNC) printf("msync: cleaning/flushing address range [%x-%x)\n", addr, addr+size); #endif /* * Could pass this in as a third flag argument to implement * Sun's MS_ASYNC. */ syncio = TRUE; /* * XXX bummer, gotta flush all cached pages to ensure * consistency with the file system cache. 
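/*
 * Illustrative sketch (not part of this patch): for a file mapping, mmap()
 * above derives the maximum protection from how the file was opened --
 * read access requires FREAD, and write access is granted to a MAP_SHARED
 * mapping only when the descriptor was opened FWRITE (a private mapping
 * may always be written, since the underlying file is never modified).
 * A hypothetical condensed version of that derivation:
 */
#define SK_PROT_READ	0x1
#define SK_PROT_WRITE	0x2
#define SK_PROT_EXEC	0x4

#define SK_FREAD	0x1
#define SK_FWRITE	0x2

static int
sketch_max_prot(int fflags, int shared)
{
	int maxprot = SK_PROT_EXEC;	/* mirrors the "???" above */

	if (fflags & SK_FREAD)
		maxprot |= SK_PROT_READ;
	if (!shared || (fflags & SK_FWRITE))
		maxprot |= SK_PROT_WRITE;
	return maxprot;
}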
Otherwise, we could * pass this in to implement Sun's MS_INVALIDATE. */ invalidate = TRUE; /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr+size, syncio, invalidate); switch (rv) { case KERN_SUCCESS: break; case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } return (0); } struct munmap_args { caddr_t addr; int len; }; int munmap(p, uap, retval) register struct proc *p; register struct munmap_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; vm_map_t map; #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("munmap(%d): addr %x len %x\n", p->p_pid, uap->addr, uap->len); #endif addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return(EINVAL); size = (vm_size_t) round_page(uap->len); if (size == 0) return(0); /* * Check for illegal addresses. Watch out for address wrap... * Note that VM_*_ADDRESS are not constants due to casts (argh). */ - if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif - if (addr > addr + size) + if (addr + size < addr) return (EINVAL); map = &p->p_vmspace->vm_map; /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) return(EINVAL); /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr+size); return(0); } void munmapfd(fd) int fd; { #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("munmapfd(%d): fd %d\n", curproc->p_pid, fd); #endif /* * XXX should vm_deallocate any regions mapped to this file */ curproc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; } struct mprotect_args { caddr_t addr; int len; int prot; }; int mprotect(p, uap, retval) struct proc *p; struct mprotect_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; register vm_prot_t prot; #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("mprotect(%d): addr %x len %x prot %d\n", p->p_pid, uap->addr, uap->len, uap->prot); #endif addr = (vm_offset_t)uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return(EINVAL); size = (vm_size_t)uap->len; prot = uap->prot & VM_PROT_ALL; switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } struct madvise_args { caddr_t addr; int len; int behav; }; /* ARGSUSED */ int madvise(p, uap, retval) struct proc *p; struct madvise_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } struct mincore_args { caddr_t addr; int len; char *vec; }; /* ARGSUSED */ int mincore(p, uap, retval) struct proc *p; struct mincore_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } struct mlock_args { caddr_t addr; size_t len; }; int mlock(p, uap, retval) struct proc *p; struct mlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; extern int vm_page_max_wired; #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("mlock(%d): addr %x len %x\n", p->p_pid, uap->addr, uap->len); #endif addr = (vm_offset_t)uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); size = round_page((vm_size_t)uap->len); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > 
p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (EAGAIN); #else if (error = suser(p->p_ucred, &p->p_acflag)) return (error); #endif error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } struct munlock_args { caddr_t addr; size_t len; }; int munlock(p, uap, retval) struct proc *p; struct munlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; #ifdef DEBUG if (mmapdebug & MDB_FOLLOW) printf("munlock(%d): addr %x len %x\n", p->p_pid, uap->addr, uap->len); #endif addr = (vm_offset_t)uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); #ifndef pmap_wired_count if (error = suser(p->p_ucred, &p->p_acflag)) return (error); #endif size = round_page((vm_size_t)uap->len); error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * Internal version of mmap. * Currently used by mmap, exec, and sys5 shared memory. * Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) register vm_map_t map; register vm_offset_t *addr; register vm_size_t size; vm_prot_t prot, maxprot; register int flags; caddr_t handle; /* XXX should be vp */ vm_offset_t foff; { register vm_pager_t pager; boolean_t fitit; vm_object_t object; struct vnode *vp = NULL; int type; int rv = KERN_SUCCESS; if (size == 0) return (0); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { fitit = FALSE; (void)vm_deallocate(map, *addr, size); } /* * Lookup/allocate pager. All except an unnamed anonymous lookup * gain a reference to ensure continued existance of the object. * (XXX the exception is to appease the pageout daemon) */ if (flags & MAP_ANON) type = PG_DFLT; else { vp = (struct vnode *)handle; if (vp->v_type == VCHR) { type = PG_DEVICE; handle = (caddr_t)vp->v_rdev; } else type = PG_VNODE; } pager = vm_pager_allocate(type, handle, size, prot, foff); if (pager == NULL) return (type == PG_DEVICE ? EINVAL : ENOMEM); /* * Find object and release extra reference gained by lookup */ object = vm_object_lookup(pager); vm_object_deallocate(object); /* * Anonymous memory. */ if (flags & MAP_ANON) { rv = vm_allocate_with_pager(map, addr, size, fitit, pager, foff, TRUE); if (rv != KERN_SUCCESS) { if (handle == NULL) vm_pager_deallocate(pager); else vm_object_deallocate(object); goto out; } /* * Don't cache anonymous objects. * Loses the reference gained by vm_pager_allocate. * Note that object will be NULL when handle == NULL, * this is ok since vm_allocate_with_pager has made * sure that these objects are uncached. */ (void) pager_cache(object, FALSE); #ifdef DEBUG if (mmapdebug & MDB_MAPIT) printf("vm_mmap(%d): ANON *addr %x size %x pager %x\n", curproc->p_pid, *addr, size, pager); #endif } /* * Must be a mapped file. * Distinguish between character special and regular files. */ else if (vp->v_type == VCHR) { rv = vm_allocate_with_pager(map, addr, size, fitit, pager, foff, FALSE); /* * Uncache the object and lose the reference gained * by vm_pager_allocate(). If the call to * vm_allocate_with_pager() was sucessful, then we * gained an additional reference ensuring the object * will continue to exist. If the call failed then * the deallocate call below will terminate the * object which is fine. 
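/*
 * Illustrative sketch (not part of this patch): mlock() above refuses a
 * request that would push the system-wide wired page count past
 * vm_page_max_wired, or the per-process wired total past RLIMIT_MEMLOCK.
 * A hypothetical stand-alone version of those two checks:
 */
#include <stddef.h>
#include <stdint.h>

static int
sketch_mlock_allowed(size_t len_pages, uint64_t wired_now,
    uint64_t max_wired, uint64_t proc_wired_bytes, uint64_t memlock_limit,
    size_t page_size)
{
	if (len_pages + wired_now > max_wired)
		return 0;	/* would exceed the system-wide cap (EAGAIN) */
	if (len_pages * page_size + proc_wired_bytes > memlock_limit)
		return 0;	/* would exceed this process's RLIMIT_MEMLOCK */
	return 1;
}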
*/ (void) pager_cache(object, FALSE); if (rv != KERN_SUCCESS) goto out; } /* * A regular file */ else { #ifdef DEBUG if (object == NULL) printf("vm_mmap: no object: vp %x, pager %x\n", vp, pager); #endif /* * Map it directly. * Allows modifications to go out to the vnode. */ if (flags & MAP_SHARED) { rv = vm_allocate_with_pager(map, addr, size, fitit, pager, foff, FALSE); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); goto out; } /* * Don't cache the object. This is the easiest way * of ensuring that data gets back to the filesystem * because vnode_pager_deallocate() will fsync the * vnode. pager_cache() will lose the extra ref. */ if (prot & VM_PROT_WRITE) pager_cache(object, FALSE); else vm_object_deallocate(object); } /* * Copy-on-write of file. Two flavors. * MAP_COPY is true COW, you essentially get a snapshot of * the region at the time of mapping. MAP_PRIVATE means only * that your changes are not reflected back to the object. * Changes made by others will be seen. */ else { vm_map_t tmap; vm_offset_t off; /* locate and allocate the target address space */ rv = vm_map_find(map, NULL, (vm_offset_t)0, addr, size, fitit); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); goto out; } tmap = vm_map_create(pmap_create(size), VM_MIN_ADDRESS, VM_MIN_ADDRESS+size, TRUE); off = VM_MIN_ADDRESS; rv = vm_allocate_with_pager(tmap, &off, size, TRUE, pager, foff, FALSE); if (rv != KERN_SUCCESS) { vm_object_deallocate(object); vm_map_deallocate(tmap); goto out; } /* * (XXX) * MAP_PRIVATE implies that we see changes made by * others. To ensure that we need to guarentee that * no copy object is created (otherwise original * pages would be pushed to the copy object and we * would never see changes made by others). We * totally sleeze it right now by marking the object * internal temporarily. */ if ((flags & MAP_COPY) == 0) object->flags |= OBJ_INTERNAL; rv = vm_map_copy(map, tmap, *addr, size, off, FALSE, FALSE); object->flags &= ~OBJ_INTERNAL; /* * (XXX) * My oh my, this only gets worse... * Force creation of a shadow object so that * vm_map_fork will do the right thing. */ if ((flags & MAP_COPY) == 0) { vm_map_t tmap; vm_map_entry_t tentry; vm_object_t tobject; vm_offset_t toffset; vm_prot_t tprot; boolean_t twired, tsu; tmap = map; vm_map_lookup(&tmap, *addr, VM_PROT_WRITE, &tentry, &tobject, &toffset, &tprot, &twired, &tsu); vm_map_lookup_done(tmap, tentry); } /* * (XXX) * Map copy code cannot detect sharing unless a * sharing map is involved. So we cheat and write * protect everything ourselves. */ vm_object_pmap_copy(object, foff, foff + size); vm_object_deallocate(object); vm_map_deallocate(tmap); if (rv != KERN_SUCCESS) goto out; } #ifdef DEBUG if (mmapdebug & MDB_MAPIT) printf("vm_mmap(%d): FILE *addr %x size %x pager %x\n", curproc->p_pid, *addr, size, pager); #endif } /* * Correct protection (default is VM_PROT_ALL). * If maxprot is different than prot, we must set both explicitly. */ rv = KERN_SUCCESS; if (maxprot != VM_PROT_ALL) rv = vm_map_protect(map, *addr, *addr+size, maxprot, TRUE); if (rv == KERN_SUCCESS && prot != maxprot) rv = vm_map_protect(map, *addr, *addr+size, prot, FALSE); if (rv != KERN_SUCCESS) { (void) vm_deallocate(map, *addr, size); goto out; } /* * Shared memory is also shared with children. 
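/*
 * Illustrative sketch (not part of this patch): the MAP_PRIVATE semantics
 * described above -- "your changes are not reflected back to the object"
 * -- can be observed from user space with the modern mmap(2) interface
 * (MAP_COPY, the true-snapshot variant, is a historical BSD flag and is
 * not shown).  The file name below is hypothetical.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/sketch_private_demo", O_RDWR | O_CREAT, 0600);
	char *p;

	if (fd < 0 || write(fd, "original", 8) != 8)
		return 1;
	p = mmap(NULL, 8, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	p[0] = 'X';	/* copy-on-write: only this mapping changes */
	/* rereading the file still yields "original" */
	munmap(p, 8);
	close(fd);
	return 0;
}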
*/ if (flags & MAP_SHARED) { rv = vm_map_inherit(map, *addr, *addr+size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { (void) vm_deallocate(map, *addr, size); goto out; } } out: #ifdef DEBUG if (mmapdebug & MDB_MAPIT) printf("vm_mmap: rv %d\n", rv); #endif switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 57ffbe843e6c..9dc605490af5 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -1,270 +1,271 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id$ + * $Id: vm_page.h,v 1.3 1994/08/02 07:55:32 davidg Exp $ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). */ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO * queue or free list (P) */ TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ vm_object_t object; /* which object am I in (O,P)*/ vm_offset_t offset; /* offset into object (O,P) */ u_short wire_count; /* wired down maps refs (P) */ u_short flags; /* see below */ short hold_count; /* page hold count */ u_short act_count; /* page usage count */ vm_offset_t phys_addr; /* physical address of page */ }; /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ #define PG_INACTIVE 0x0001 /* page is in inactive list (P) */ #define PG_ACTIVE 0x0002 /* page is in active list (P) */ #define PG_LAUNDRY 0x0004 /* page is being cleaned now (P)*/ #define PG_CLEAN 0x0008 /* page has not been modified */ #define PG_BUSY 0x0010 /* page is in transit (O) */ #define PG_WANTED 0x0020 /* someone is waiting for page (O) */ #define PG_TABLED 0x0040 /* page is in VP table (O) */ #define PG_COPYONWRITE 0x0080 /* must copy page before changing (O) */ #define PG_FICTITIOUS 0x0100 /* physical page doesn't exist (O) */ #define PG_FAKE 0x0200 /* page is placeholder for pagein (O) */ #define PG_FILLED 0x0400 /* client flag to set when filled */ #define PG_DIRTY 0x0800 /* client flag to set when dirty */ +#define PG_REFERENCED 0x1000 /* page has been referenced */ #define PG_PAGEROWNED 0x4000 /* DEBUG: async paging op in progress */ #define PG_PTPAGE 0x8000 /* DEBUG: is a user page table page */ #if VM_PAGE_DEBUG #define VM_PAGE_CHECK(mem) { \ if ((((unsigned int) mem) < ((unsigned int) &vm_page_array[0])) || \ (((unsigned int) mem) > \ ((unsigned int) &vm_page_array[last_page-first_page])) || \ ((mem->flags & (PG_ACTIVE | PG_INACTIVE)) == \ (PG_ACTIVE | PG_INACTIVE))) \ panic("vm_page_check: not valid!"); \ } #else /* VM_PAGE_DEBUG */ #define VM_PAGE_CHECK(mem) #endif /* VM_PAGE_DEBUG */ #ifdef KERNEL /* * Each pageable resident page falls into one of three lists: * * free * Available for allocation now. * inactive * Not referenced in any map, but still has an * object/offset-page mapping, and may be dirty. * This is the list of pages that should be * paged out next. * active * A list of pages which have been placed in * at least one physical map. 
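/*
 * Illustrative sketch (not part of this patch): the free/active/inactive
 * page queues declared above are TAILQs from <sys/queue.h>; moving a page
 * to the tail of its queue is how the scans in vm_pageout.c "age" it.
 * A hypothetical stand-alone exercise of the same macros:
 */
#include <sys/queue.h>

struct sketch_page {
	TAILQ_ENTRY(sketch_page) pageq;	/* queue linkage, as in vm_page */
	int act_count;
};

TAILQ_HEAD(sketch_pglist, sketch_page);

static void
sketch_requeue_at_tail(struct sketch_pglist *q, struct sketch_page *m)
{
	TAILQ_REMOVE(q, m, pageq);
	TAILQ_INSERT_TAIL(q, m, pageq);	/* least recently considered */
}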
This list is * ordered, in LRU-like fashion. */ extern struct pglist vm_page_queue_free; /* memory free queue */ extern struct pglist vm_page_queue_active; /* active memory queue */ extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ extern vm_page_t vm_page_array; /* First resident page in table */ extern long first_page; /* first physical page number */ /* ... represented in vm_page_array */ extern long last_page; /* last physical page number */ /* ... represented in vm_page_array */ /* [INCLUSIVE] */ extern vm_offset_t first_phys_addr; /* physical address for first_page */ extern vm_offset_t last_phys_addr; /* physical address for last_page */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define IS_VM_PHYSADDR(pa) \ ((pa) >= first_phys_addr && (pa) <= last_phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) extern simple_lock_data_t vm_page_queue_lock; /* lock on active and inactive page queues */ extern /* lock on free page queue */ simple_lock_data_t vm_page_queue_free_lock; /* * Functions implemented as macros */ #define PAGE_ASSERT_WAIT(m, interruptible) { \ (m)->flags |= PG_WANTED; \ assert_wait((int) (m), (interruptible)); \ } #define PAGE_WAKEUP(m) { \ (m)->flags &= ~PG_BUSY; \ if ((m)->flags & PG_WANTED) { \ (m)->flags &= ~PG_WANTED; \ wakeup((caddr_t) (m)); \ } \ } #define vm_page_lock_queues() simple_lock(&vm_page_queue_lock) #define vm_page_unlock_queues() simple_unlock(&vm_page_queue_lock) #define vm_page_set_modified(m) { (m)->flags &= ~PG_CLEAN; } #define VM_PAGE_INIT(mem, object, offset) { \ (mem)->flags = PG_BUSY | PG_CLEAN | PG_FAKE; \ vm_page_insert((mem), (object), (offset)); \ (mem)->wire_count = 0; \ (mem)->hold_count = 0; \ (mem)->act_count = 0; \ } void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_offset_t)); void vm_page_copy __P((vm_page_t, vm_page_t)); void vm_page_deactivate __P((vm_page_t)); void vm_page_free __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_offset_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_offset_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_offset_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t)); void vm_page_wire __P((vm_page_t)); boolean_t vm_page_zero_fill __P((vm_page_t)); /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ static inline void vm_page_hold(mem) vm_page_t mem; { mem->hold_count++; } static inline void vm_page_unhold(mem) vm_page_t mem; { if( --mem->hold_count < 0) panic("vm_page_unhold: hold count < 0!!!"); } #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 99d7365900ca..7c092bb1cf71 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,790 +1,791 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.4 1994/08/01 11:25:45 davidg Exp $ + * $Id: vm_pageout.c,v 1.5 1994/08/02 07:55:33 davidg Exp $ */ /* * The proverbial page-out daemon. 
*/ #include #include #include #include #include #include #include #include extern vm_map_t kmem_map; int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pagescanner; /* Event on which pagescanner sleeps */ int vm_pageout_free_min = 0; /* Stop pageout to wait for pagers at this free level */ int vm_pageout_pages_needed = 0; /* flag saying that the pageout daemon needs pages */ int vm_page_pagesfreed; extern int npendingio; extern int hz; int vm_pageout_proc_limit; extern int nswiodone; extern int swap_pager_full; extern int swap_pager_ready(); #define MAXREF 32767 #define MAXSCAN 512 /* maximum number of pages to scan in active queue */ /* set the "clock" hands to be (MAXSCAN * 4096) Bytes */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_MAX 300 #define LOWATER ((2048*1024)/NBPG) #define VM_PAGEOUT_PAGE_COUNT 8 +int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; static vm_offset_t vm_space_needed; int vm_pageout_req_do_stats; int vm_page_max_wired = 0; /* XXX max # of wired pages system-wide */ /* * vm_pageout_clean: * cleans a vm_page */ int vm_pageout_clean(m, sync) register vm_page_t m; int sync; { /* * Clean the page and remove it from the * laundry. * * We set the busy bit to cause * potential page faults on this page to * block. * * And we set pageout-in-progress to keep * the object from disappearing during * pageout. This guarantees that the * page won't move from the inactive * queue. (However, any other page on * the inactive queue may move!) */ register vm_object_t object; register vm_pager_t pager; int pageout_status[VM_PAGEOUT_PAGE_COUNT]; vm_page_t ms[VM_PAGEOUT_PAGE_COUNT]; int pageout_count; int anyok=0; int i; vm_offset_t offset = m->offset; object = m->object; if (!object) { printf("pager: object missing\n"); return 0; } /* * Try to collapse the object before * making a pager for it. We must * unlock the page queues first. * We try to defer the creation of a pager * until all shadows are not paging. This * allows vm_object_collapse to work better and * helps control swap space size. * (J. Dyson 11 Nov 93) */ if (!object->pager && cnt.v_free_count < vm_pageout_free_min) return 0; if (!object->pager && object->shadow && object->shadow->paging_in_progress) return 0; if( !sync) { if (object->shadow) { vm_object_collapse(object); if (!vm_page_lookup(object, offset)) return 0; } if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { return 0; } } pageout_count = 1; ms[0] = m; if( pager = object->pager) { - for(i=1;iflags & (PG_CLEAN|PG_INACTIVE|PG_BUSY)) == PG_INACTIVE) || (( ms[i]->flags & PG_CLEAN) == 0 && sync == VM_PAGEOUT_FORCE)) && (ms[i]->wire_count == 0) && (ms[i]->hold_count == 0)) pageout_count++; else break; } else break; } for(i=0;iflags |= PG_BUSY; pmap_page_protect(VM_PAGE_TO_PHYS(ms[i]), VM_PROT_READ); } object->paging_in_progress += pageout_count; cnt.v_pageouts += pageout_count; } else { m->flags |= PG_BUSY; pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ); cnt.v_pageouts++; object->paging_in_progress++; pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, object->size, VM_PROT_ALL, 0); if (pager != NULL) { vm_object_setpager(object, pager, 0, FALSE); } } /* * If there is no pager for the page, * use the default pager. If there's * no place to put the page at the * moment, leave it in the laundry and * hope that there will be paging space * later. */ if ((pager && pager->pg_type == PG_SWAP) || cnt.v_free_count >= vm_pageout_free_min) { if( pageout_count == 1) { pageout_status[0] = pager ? 
vm_pager_put(pager, m, ((sync || (object == kernel_object)) ? TRUE: FALSE)) : VM_PAGER_FAIL; } else { if( !pager) { for(i=0;iflags &= ~PG_LAUNDRY; ++anyok; break; case VM_PAGER_PEND: ms[i]->flags &= ~PG_LAUNDRY; ++anyok; break; case VM_PAGER_BAD: /* * Page outside of range of object. * Right now we essentially lose the * changes by pretending it worked. */ ms[i]->flags &= ~PG_LAUNDRY; ms[i]->flags |= PG_CLEAN; pmap_clear_modify(VM_PAGE_TO_PHYS(ms[i])); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then * reactivate the page so it doesn't * clog the inactive list. (We will * try paging out it again later). */ if (ms[i]->flags & PG_INACTIVE) vm_page_activate(ms[i]); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave * the page busy to block all other accesses. * Also, leave the paging in progress * indicator set so that we don't attempt an * object collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { PAGE_WAKEUP(ms[i]); if (--object->paging_in_progress == 0) wakeup((caddr_t) object); if (pmap_is_referenced(VM_PAGE_TO_PHYS(ms[i]))) { pmap_clear_reference(VM_PAGE_TO_PHYS(ms[i])); if( ms[i]->flags & PG_INACTIVE) vm_page_activate(ms[i]); } } } return anyok; } /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * shadows. * * The object and map must be locked. */ int vm_pageout_object_deactivate_pages(map, object, count) vm_map_t map; vm_object_t object; int count; { register vm_page_t p, next; int rcount; int s; int dcount; dcount = 0; if (count == 0) count = 1; if (object->shadow) { int scount = count; if( object->shadow->ref_count > 1) scount /= object->shadow->ref_count; if( scount) dcount += vm_pageout_object_deactivate_pages(map, object->shadow, scount); } if (object->paging_in_progress) return dcount; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = object->memq.tqh_first; while (p && (rcount-- > 0)) { next = p->listq.tqe_next; vm_page_lock_queues(); /* * if a page is active, not wired and is in the processes pmap, * then deactivate the page. */ if ((p->flags & (PG_ACTIVE|PG_BUSY)) == PG_ACTIVE && p->wire_count == 0 && p->hold_count == 0 && pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p))) { p->act_count -= min(p->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!p->act_count) { vm_page_deactivate(p); pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); /* * else if on the next go-around we will deactivate the page * we need to place the page on the end of the queue to age * the other pages in memory. */ } else { TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); TAILQ_REMOVE(&object->memq, p, listq); TAILQ_INSERT_TAIL(&object->memq, p, listq); } /* * see if we are done yet */ if (p->flags & PG_INACTIVE) { --count; ++dcount; if (count <= 0 && cnt.v_inactive_count > cnt.v_inactive_target) { vm_page_unlock_queues(); return dcount; } } } else { /* * Move the page to the bottom of the queue. 
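/*
 * Illustrative sketch (not part of this patch): the active-queue scan
 * above keeps a small usage counter per page -- referenced pages gain
 * ACT_ADVANCE (capped at ACT_MAX), unreferenced pages lose ACT_DECLINE,
 * and a page whose counter reaches zero is deactivated.  A hypothetical
 * stand-alone form of that aging step:
 */
#define SK_ACT_DECLINE	1
#define SK_ACT_ADVANCE	3
#define SK_ACT_MAX	300

/* returns 1 if the page should be moved to the inactive queue */
static int
sketch_age_page(int *act_count, int referenced)
{
	if (referenced) {
		if (*act_count < SK_ACT_MAX)
			*act_count += SK_ACT_ADVANCE;
		return 0;
	}
	*act_count -= (*act_count < SK_ACT_DECLINE) ?
	    *act_count : SK_ACT_DECLINE;
	return *act_count == 0;
}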
*/ pmap_clear_reference(VM_PAGE_TO_PHYS(p)); if (p->act_count < ACT_MAX) p->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); TAILQ_REMOVE(&object->memq, p, listq); TAILQ_INSERT_TAIL(&object->memq, p, listq); } } vm_page_unlock_queues(); p = next; } return dcount; } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ void vm_pageout_map_deactivate_pages(map, entry, count, freeer) vm_map_t map; vm_map_entry_t entry; int *count; int (*freeer)(vm_map_t, vm_object_t, int); { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (*count <= 0) return; vm_map_reference(map); if (!lock_try_read(&map->lock)) { vm_map_deallocate(map); return; } if (entry == 0) { tmpe = map->header.next; while (tmpe != &map->header && *count > 0) { vm_pageout_map_deactivate_pages(map, tmpe, count, freeer); tmpe = tmpe->next; }; } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; while (tmpe != &tmpm->header && *count > 0) { vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer); tmpe = tmpe->next; }; } else if (obj = entry->object.vm_object) { *count -= (*freeer)(map, obj, *count); } lock_read_done(&map->lock); vm_map_deallocate(map); return; } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ int vm_pageout_scan() { vm_page_t m; int page_shortage, maxscan, maxlaunder; int pages_freed, free, nproc; int desired_free; vm_page_t next; struct proc *p; vm_object_t object; int s; int force_wakeup = 0; morefree: /* * scan the processes for exceeding their rlimits or if process * is swapped out -- deactivate pages */ rescanproc1: for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { vm_offset_t size; int overage; vm_offset_t limit; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM|P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = min(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max); /* * let processes that are swapped out really be swapped out * set the limit to nothing (will force a swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; size = p->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG; if (size >= limit) { overage = (size - limit) / NBPG; vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); } } if (((cnt.v_free_count + cnt.v_inactive_count) >= (cnt.v_inactive_target + cnt.v_free_target)) && (cnt.v_free_count >= cnt.v_free_target)) return force_wakeup; pages_freed = 0; desired_free = cnt.v_free_target; /* * Start scanning the inactive queue for pages we can free. * We keep scanning until we have enough free pages or * we have scanned through the entire queue. If we * encounter dirty pages, we start cleaning them. 
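/*
 * Illustrative arithmetic (not part of this patch): when a process's
 * resident size exceeds its RSS limit, vm_pageout_scan() above asks the
 * deactivation routine to push out the difference, expressed in pages.
 * A hypothetical stand-alone version of that computation:
 */
#include <stdint.h>

#define SKETCH_NBPG 4096	/* assumed page size */

static int
sketch_rss_overage_pages(uint64_t resident_bytes, uint64_t limit_bytes)
{
	if (resident_bytes < limit_bytes)
		return 0;
	return (int)((resident_bytes - limit_bytes) / SKETCH_NBPG);
}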
*/ maxlaunder = (cnt.v_free_target - cnt.v_free_count); maxscan = cnt.v_inactive_count; rescan1: m = vm_page_queue_inactive.tqh_first; while (m && (maxscan-- > 0) && (cnt.v_free_count < desired_free) ) { vm_page_t next; next = m->pageq.tqe_next; if( (m->flags & PG_INACTIVE) == 0) { printf("vm_pageout_scan: page not inactive?"); continue; } /* * activate held pages */ if (m->hold_count != 0) { vm_page_activate(m); m = next; continue; } /* * dont mess with busy pages */ if (m->flags & PG_BUSY) { m = next; continue; } /* * if page is clean and but the page has been referenced, * then reactivate the page, but if we are very low on memory * or the page has not been referenced, then we free it to the * vm system. */ if (m->flags & PG_CLEAN) { if ((cnt.v_free_count > vm_pageout_free_min) /* XXX */ && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { vm_page_activate(m); } else if (!m->act_count) { pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); ++pages_freed; } else { m->act_count -= min(m->act_count, ACT_DECLINE); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); } } else if ((m->flags & PG_LAUNDRY) && maxlaunder > 0) { int written; if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); m = next; continue; } /* * If a page is dirty, then it is either * being washed (but not yet cleaned) * or it is still in the laundry. If it is * still in the laundry, then we start the * cleaning operation. */ if (written = vm_pageout_clean(m,0)) { maxlaunder -= written; } /* * if the next page has been re-activated, start scanning again */ - if (next && (next->flags & PG_INACTIVE) == 0) + if (!next || (next->flags & PG_INACTIVE) == 0) goto rescan1; } else if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); } m = next; } /* * now check malloc area or swap processes out if we are in low * memory conditions */ if (cnt.v_free_count <= cnt.v_free_min) { /* * swap out inactive processes */ swapout_threads(); } /* * Compute the page shortage. If we are still very low on memory * be sure that we will move a minimal amount of pages from active * to inactive. */ page_shortage = cnt.v_inactive_target - (cnt.v_free_count + cnt.v_inactive_count); if (page_shortage <= 0) { if (pages_freed == 0) { if( cnt.v_free_count < cnt.v_free_min) { page_shortage = cnt.v_free_min - cnt.v_free_count; } else if(((cnt.v_free_count + cnt.v_inactive_count) < (cnt.v_free_min + cnt.v_inactive_target))) { page_shortage = 1; } else { page_shortage = 0; } } } maxscan = cnt.v_active_count; m = vm_page_queue_active.tqh_first; while (m && maxscan-- && (page_shortage > 0)) { next = m->pageq.tqe_next; /* * Don't deactivate pages that are busy. */ if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { m = next; continue; } if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); if (m->act_count < ACT_MAX) m->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); TAILQ_REMOVE(&m->object->memq, m, listq); TAILQ_INSERT_TAIL(&m->object->memq, m, listq); } else { m->act_count -= min(m->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!m->act_count) { vm_page_deactivate(m); --page_shortage; /* * else if on the next go-around we will deactivate the page * we need to place the page on the end of the queue to age * the other pages in memory. 
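
/*
 * A compact restatement (hypothetical helper, not part of this change) of
 * the shortage policy above: normally the goal is to refill the inactive
 * target, but when a pass freed nothing the daemon still forces at least a
 * token amount of active-to-inactive movement so progress is made.
 */
static int
compute_page_shortage(pages_freed)
	int pages_freed;
{
	int shortage;

	shortage = cnt.v_inactive_target -
	    (cnt.v_free_count + cnt.v_inactive_count);
	if (shortage <= 0 && pages_freed == 0) {
		if (cnt.v_free_count < cnt.v_free_min)
			shortage = cnt.v_free_min - cnt.v_free_count;
		else if ((cnt.v_free_count + cnt.v_inactive_count) <
		    (cnt.v_free_min + cnt.v_inactive_target))
			shortage = 1;
		else
			shortage = 0;
	}
	return shortage;
}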
*/ } else { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); TAILQ_REMOVE(&m->object->memq, m, listq); TAILQ_INSERT_TAIL(&m->object->memq, m, listq); } } m = next; } /* * if we have not freed any pages and we are desparate for memory * then we keep trying until we get some (any) memory. */ if( !force_wakeup && (swap_pager_full || !force_wakeup || (pages_freed == 0 && (cnt.v_free_count < cnt.v_free_min)))){ vm_pager_sync(); force_wakeup = 1; goto morefree; } vm_page_pagesfreed += pages_freed; return force_wakeup; } /* * vm_pageout is the high level pageout daemon. */ void vm_pageout() { extern npendingio, swiopend; static nowakeup; (void) spl0(); /* * Initialize some paging parameters. */ vmretry: cnt.v_free_min = 12; cnt.v_free_reserved = 8; if (cnt.v_free_min < 8) cnt.v_free_min = 8; if (cnt.v_free_min > 32) cnt.v_free_min = 32; vm_pageout_free_min = 4; cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; cnt.v_inactive_target = cnt.v_free_count / 12; cnt.v_free_min += cnt.v_free_reserved; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; (void) swap_pager_alloc(0, 0, 0, 0); /* * The pageout daemon is never done, so loop * forever. */ while (TRUE) { int force_wakeup; extern struct loadavg averunnable; /* cnt.v_free_min = 12 + averunnable.ldavg[0] / 1024; cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; cnt.v_inactive_target = cnt.v_free_target*2; */ tsleep((caddr_t) &vm_pages_needed, PVM, "psleep", 0); vm_pager_sync(); /* * The force wakeup hack added to eliminate delays and potiential * deadlock. It was possible for the page daemon to indefintely * postpone waking up a process that it might be waiting for memory * on. The putmulti stuff seems to have aggravated the situation. */ force_wakeup = vm_pageout_scan(); vm_pager_sync(); if( force_wakeup) wakeup( (caddr_t) &cnt.v_free_count); cnt.v_scan++; wakeup((caddr_t) kmem_map); } } diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index b8e5a192796f..6d1c37287331 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1,1424 +1,1438 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993,1994 John S. Dyson * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
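
/*
 * Worked example of the threshold initialization in vm_pageout() above,
 * using its hard-coded values (cnt.v_free_min = 12, cnt.v_free_reserved = 8):
 *
 *	cnt.v_free_target     = 2 * 12 + 8 = 32 pages
 *	cnt.v_free_min        = 12 + 8     = 20 pages (after the += reserved)
 *	cnt.v_inactive_target = cnt.v_free_count / 12 at startup
 *
 * vm_pageout_scan() keeps working until the free count is back above
 * v_free_target and free + inactive is above the two targets combined.
 */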
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.17 1994/04/05 03:23:53 davidg Exp $ + * $Id: vnode_pager.c,v 1.2 1994/05/25 09:21:11 rgrimes Exp $ */ /* * Page to/from files (vnodes). * * TODO: * pageouts * fix credential use (uses current process credentials now) */ /* * MODIFICATIONS: * John S. Dyson 08 Dec 93 * * This file in conjunction with some vm_fault mods, eliminate the performance * advantage for using the buffer cache and minimize memory copies. * * 1) Supports multiple - block reads * 2) Bypasses buffer cache for reads - * + * * TODO: * * 1) Totally bypass buffer cache for reads * (Currently will still sometimes use buffer cache for reads) * 2) Bypass buffer cache for writes * (Code does not support it, but mods are simple) */ #include #include #include #include #include #include #include #include #include #include #include #include -int vnode_pager_putmulti(); +int vnode_pager_putmulti(); -void vnode_pager_init(); -vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t); -void vnode_pager_dealloc(); -int vnode_pager_getpage(); -int vnode_pager_getmulti(); -int vnode_pager_putpage(); -boolean_t vnode_pager_haspage(); +void vnode_pager_init(); +vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t); +void vnode_pager_dealloc(); +int vnode_pager_getpage(); +int vnode_pager_getmulti(); +int vnode_pager_putpage(); +boolean_t vnode_pager_haspage(); struct pagerops vnodepagerops = { vnode_pager_init, vnode_pager_alloc, vnode_pager_dealloc, vnode_pager_getpage, vnode_pager_getmulti, vnode_pager_putpage, vnode_pager_putmulti, vnode_pager_haspage }; -static int vnode_pager_input(vn_pager_t vnp, vm_page_t *m, int count, int reqpage); -static int vnode_pager_output(vn_pager_t vnp, vm_page_t *m, int count, int *rtvals); -struct buf * getpbuf() ; -void relpbuf(struct buf *bp) ; +static int vnode_pager_input(vn_pager_t vnp, vm_page_t * m, int count, int reqpage); +static int vnode_pager_output(vn_pager_t vnp, vm_page_t * m, int count, int *rtvals); +struct buf * getpbuf(); +void relpbuf(struct buf * bp); extern vm_map_t pager_map; struct pagerlst vnode_pager_list; /* list of managed vnodes */ #define MAXBP (PAGE_SIZE/DEV_BSIZE); void vnode_pager_init() { TAILQ_INIT(&vnode_pager_list); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_pager_t vnode_pager_alloc(handle, size, prot, offset) caddr_t handle; vm_size_t size; vm_prot_t prot; vm_offset_t offset; { register vm_pager_t pager; register vn_pager_t vnp; vm_object_t object; struct vattr vattr; struct vnode *vp; struct proc *p = curproc; /* XXX */ /* * Pageout to vnode, no can do yet. 
*/ if (handle == NULL) - return(NULL); + return (NULL); /* - * Vnodes keep a pointer to any associated pager so no need to - * lookup with vm_pager_lookup. + * Vnodes keep a pointer to any associated pager so no need to lookup + * with vm_pager_lookup. */ - vp = (struct vnode *)handle; - pager = (vm_pager_t)vp->v_vmdata; + vp = (struct vnode *) handle; + pager = (vm_pager_t) vp->v_vmdata; if (pager == NULL) { + /* * Allocate pager structures */ - pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, M_WAITOK); + pager = (vm_pager_t) malloc(sizeof *pager, M_VMPAGER, M_WAITOK); if (pager == NULL) - return(NULL); - vnp = (vn_pager_t)malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK); + return (NULL); + vnp = (vn_pager_t) malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK); if (vnp == NULL) { - free((caddr_t)pager, M_VMPAGER); - return(NULL); + free((caddr_t) pager, M_VMPAGER); + return (NULL); } + /* * And an object of the appropriate size */ if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) { object = vm_object_allocate(round_page(vattr.va_size)); vm_object_enter(object, pager); vm_object_setpager(object, pager, 0, TRUE); } else { - free((caddr_t)vnp, M_VMPGDATA); - free((caddr_t)pager, M_VMPAGER); - return(NULL); + free((caddr_t) vnp, M_VMPGDATA); + free((caddr_t) pager, M_VMPAGER); + return (NULL); } + /* * Hold a reference to the vnode and initialize pager data. */ VREF(vp); vnp->vnp_flags = 0; vnp->vnp_vp = vp; vnp->vnp_size = vattr.va_size; TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list); pager->pg_handle = handle; pager->pg_type = PG_VNODE; pager->pg_ops = &vnodepagerops; - pager->pg_data = (caddr_t)vnp; - vp->v_vmdata = (caddr_t)pager; + pager->pg_data = (caddr_t) vnp; + vp->v_vmdata = (caddr_t) pager; } else { + /* - * vm_object_lookup() will remove the object from the - * cache if found and also gain a reference to the object. + * vm_object_lookup() will remove the object from the cache if + * found and also gain a reference to the object. 
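
/*
 * Size bookkeeping illustration for the allocation above (assuming
 * 4096-byte pages): the VM object always spans whole pages while vnp_size
 * keeps the exact byte length, so a 10000-byte file gets a 12288-byte
 * (three page) object, and the input paths below know that the final 2288
 * bytes of the last page must be zero filled rather than read.
 */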
*/ object = vm_object_lookup(pager); } - return(pager); + return (pager); } void vnode_pager_dealloc(pager) vm_pager_t pager; { - register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + register vn_pager_t vnp = (vn_pager_t) pager->pg_data; register struct vnode *vp; - struct proc *p = curproc; /* XXX */ + struct proc *p = curproc; /* XXX */ if (vp = vnp->vnp_vp) { vp->v_vmdata = NULL; vp->v_flag &= ~VTEXT; #if 0 /* can hang if done at reboot on NFS FS */ (void) VOP_FSYNC(vp, p->p_ucred, p); #endif vrele(vp); } - TAILQ_REMOVE(&vnode_pager_list, pager, pg_list); - free((caddr_t)vnp, M_VMPGDATA); - free((caddr_t)pager, M_VMPAGER); + free((caddr_t) vnp, M_VMPGDATA); + free((caddr_t) pager, M_VMPAGER); } int vnode_pager_getmulti(pager, m, count, reqpage, sync) vm_pager_t pager; vm_page_t *m; - int count; - int reqpage; + int count; + int reqpage; boolean_t sync; { - + return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage); } int vnode_pager_getpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { - int err; + int err; vm_page_t marray[1]; + if (pager == NULL) return FALSE; marray[0] = m; - return vnode_pager_input((vn_pager_t)pager->pg_data, marray, 1, 0); + return vnode_pager_input((vn_pager_t) pager->pg_data, marray, 1, 0); } boolean_t vnode_pager_putpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { - int err; + int err; vm_page_t marray[1]; - int rtvals[1]; + int rtvals[1]; if (pager == NULL) return FALSE; marray[0] = m; - vnode_pager_output((vn_pager_t)pager->pg_data, marray, 1, rtvals); + vnode_pager_output((vn_pager_t) pager->pg_data, marray, 1, rtvals); return rtvals[0]; } int vnode_pager_putmulti(pager, m, c, sync, rtvals) vm_pager_t pager; vm_page_t *m; - int c; + int c; boolean_t sync; - int *rtvals; + int *rtvals; { - return vnode_pager_output((vn_pager_t)pager->pg_data, m, c, rtvals); + return vnode_pager_output((vn_pager_t) pager->pg_data, m, c, rtvals); } boolean_t vnode_pager_haspage(pager, offset) vm_pager_t pager; vm_offset_t offset; { - register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + register vn_pager_t vnp = (vn_pager_t) pager->pg_data; daddr_t bn; - int run; - int err; + int err; /* * Offset beyond end of file, do not have the page */ if (offset >= vnp->vnp_size) { - return(FALSE); + return (FALSE); } /* - * Read the index to find the disk block to read - * from. If there is no block, report that we don't - * have this data. - * + * Read the index to find the disk block to read from. If there is no + * block, report that we don't have this data. + * * Assumes that the vnode has whole page or nothing. */ err = VOP_BMAP(vnp->vnp_vp, offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize, - (struct vnode **)0, &bn, 0); + (struct vnode **) 0, &bn, 0); if (err) { - return(TRUE); + return (TRUE); } - return((long)bn < 0 ? FALSE : TRUE); + return ((long) bn < 0 ? FALSE : TRUE); } /* * Lets the VM system know about a change in size for a file. * If this vnode is mapped into some address space (i.e. we have a pager * for it) we adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. 
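
/*
 * Worked example for vnode_pager_haspage() above, assuming an 8192-byte
 * filesystem block size (mnt_stat.f_iosize): offset 20480 asks VOP_BMAP()
 * about logical block 20480 / 8192 = 2.  A negative block number (a hole)
 * reports the page as absent, an offset at or past vnp_size is absent, and
 * a VOP_BMAP() error is conservatively reported as present.
 */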
*/ void vnode_pager_setsize(vp, nsize) struct vnode *vp; - u_long nsize; + u_long nsize; { register vn_pager_t vnp; register vm_object_t object; vm_pager_t pager; /* * Not a mapped vnode */ if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL) return; + /* * Hasn't changed size */ - pager = (vm_pager_t)vp->v_vmdata; - vnp = (vn_pager_t)pager->pg_data; + pager = (vm_pager_t) vp->v_vmdata; + vnp = (vn_pager_t) pager->pg_data; if (nsize == vnp->vnp_size) return; + /* - * No object. - * This can happen during object termination since - * vm_object_page_clean is called after the object - * has been removed from the hash table, and clean - * may cause vnode write operations which can wind - * up back here. + * No object. This can happen during object termination since + * vm_object_page_clean is called after the object has been removed + * from the hash table, and clean may cause vnode write operations + * which can wind up back here. */ object = vm_object_lookup(pager); if (object == NULL) return; /* - * File has shrunk. - * Toss any cached pages beyond the new EOF. + * File has shrunk. Toss any cached pages beyond the new EOF. */ - if (round_page(nsize) < round_page(vnp->vnp_size)) { + if (nsize < vnp->vnp_size) { vm_object_lock(object); vm_object_page_remove(object, - (vm_offset_t)round_page(nsize), round_page(vnp->vnp_size)); + round_page((vm_offset_t) nsize), vnp->vnp_size); vm_object_unlock(object); + + /* + * this gets rid of garbage at the end of a page that is now + * only partially backed by the vnode... + */ + if (nsize & PAGE_MASK) { + vm_offset_t kva; + vm_page_t m; + + m = vm_page_lookup(object, trunc_page((vm_offset_t) nsize)); + if (m) { + kva = vm_pager_map_page(m); + bzero((caddr_t) kva + (nsize & PAGE_MASK), + round_page(nsize) - nsize); + vm_pager_unmap_page(kva); + } + } + } else { + + /* + * this allows the filesystem and VM cache to stay in sync if + * the VM page hasn't been modified... After the page is + * removed -- it will be faulted back in from the filesystem + * cache. + */ + if (vnp->vnp_size & PAGE_MASK) { + vm_page_t m; + + m = vm_page_lookup(object, trunc_page(vnp->vnp_size)); + if (m && (m->flags & PG_CLEAN)) { + vm_object_lock(object); + vm_object_page_remove(object, + vnp->vnp_size, vnp->vnp_size); + vm_object_unlock(object); + } + } } - vnp->vnp_size = (vm_offset_t)nsize; + vnp->vnp_size = (vm_offset_t) nsize; + object->size = round_page(nsize); + vm_object_deallocate(object); } void vnode_pager_umount(mp) register struct mount *mp; { register vm_pager_t pager, npager; struct vnode *vp; pager = vnode_pager_list.tqh_first; - while( pager) { + while (pager) { + /* - * Save the next pointer now since uncaching may - * terminate the object and render pager invalid + * Save the next pointer now since uncaching may terminate the + * object and render pager invalid */ - vp = ((vn_pager_t)pager->pg_data)->vnp_vp; + vp = ((vn_pager_t) pager->pg_data)->vnp_vp; npager = pager->pg_list.tqe_next; - if (mp == (struct mount *)0 || vp->v_mount == mp) + if (mp == (struct mount *) 0 || vp->v_mount == mp) (void) vnode_pager_uncache(vp); pager = npager; } } /* * Remove vnode associated object from the object cache. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. 
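
/*
 * Worked example for the truncation case in vnode_pager_setsize() above,
 * assuming 4096-byte pages and a new size of 10000 bytes: pages at or
 * beyond round_page(10000) == 12288 are removed outright, and the page
 * that still straddles EOF has its stale tail cleared,
 *
 *	start of bzero:  nsize & PAGE_MASK          == 1808
 *	length:          round_page(nsize) - nsize  == 2288
 *
 * which together cover exactly the bytes of that page past the new EOF.
 */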
*/ boolean_t vnode_pager_uncache(vp) register struct vnode *vp; { register vm_object_t object; boolean_t uncached, locked; vm_pager_t pager; /* * Not a mapped vnode */ - pager = (vm_pager_t)vp->v_vmdata; + pager = (vm_pager_t) vp->v_vmdata; if (pager == NULL) return (TRUE); + /* - * Unlock the vnode if it is currently locked. - * We do this since uncaching the object may result - * in its destruction which may initiate paging - * activity which may necessitate locking the vnode. + * Unlock the vnode if it is currently locked. We do this since + * uncaching the object may result in its destruction which may + * initiate paging activity which may necessitate locking the vnode. */ locked = VOP_ISLOCKED(vp); if (locked) VOP_UNLOCK(vp); + /* - * Must use vm_object_lookup() as it actually removes - * the object from the cache list. + * Must use vm_object_lookup() as it actually removes the object from + * the cache list. */ object = vm_object_lookup(pager); if (object) { uncached = (object->ref_count <= 1); pager_cache(object, FALSE); } else uncached = TRUE; if (locked) VOP_LOCK(vp); - return(uncached); + return (uncached); } -#if 0 -/* - * Remove vnode associated object from the object cache. - * - * XXX unlock the vnode if it is currently locked. - * We must do this since uncaching the object may result in its - * destruction which may initiate paging activity which may necessitate - * re-locking the vnode. - */ -boolean_t -vnode_pager_uncache(vp) - register struct vnode *vp; -{ - register vm_object_t object; - boolean_t uncached; - vm_pager_t pager; - - /* - * Not a mapped vnode - */ - pager = (vm_pager_t)vp->v_vmdata; - if (pager == NULL) - return (TRUE); - /* - * Must use vm_object_lookup() as it actually removes - * the object from the cache list. - */ - object = vm_object_lookup(pager); - if (object) { - uncached = (object->ref_count <= 1); - VOP_UNLOCK(vp); - pager_cache(object, FALSE); - VOP_LOCK(vp); - } else - uncached = TRUE; - return(uncached); -} -#endif void vnode_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * calculate the linear (byte) disk address of specified virtual * file address */ vm_offset_t vnode_pager_addr(vp, address) struct vnode *vp; vm_offset_t address; { - int rtaddress; - int bsize; + int rtaddress; + int bsize; vm_offset_t block; struct vnode *rtvp; - int err; - int vblock, voffset; - int run; + int err; + int vblock, voffset; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; - err = VOP_BMAP(vp,vblock,&rtvp,&block,0); + err = VOP_BMAP(vp, vblock, &rtvp, &block, 0); - if( err) + if (err) rtaddress = -1; else rtaddress = block * DEV_BSIZE + voffset; return rtaddress; } /* * interrupt routine for I/O completion */ void vnode_pager_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; - wakeup((caddr_t)bp); + wakeup((caddr_t) bp); } /* * small block file system vnode pager input */ int vnode_pager_input_smlfs(vnp, m) vn_pager_t vnp; vm_page_t m; { - int i; - int s; + int i; + int s; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t foff; vm_offset_t kva; - int fileaddr; - int block; + int fileaddr; + int block; vm_offset_t bsize; - int error = 0; - int run; + int error = 0; paging_offset = m->object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; foff = m->offset + paging_offset; VOP_BMAP(vp, foff, &dp, 0, 0); kva = vm_pager_map_page(m); - for(i=0;ib_flags & B_BUSY) { bp->b_flags |= B_WANTED; - tsleep ((caddr_t)bp, PVM, 
"vnwblk", 0); + tsleep((caddr_t) bp, PVM, "vnwblk", 0); continue; } - amount = bsize; if ((foff + bsize) > vnp->vnp_size) amount = vnp->vnp_size - foff; - /* - * make sure that this page is in the buffer - */ + /* + * make sure that this page is in the buffer + */ if ((amount > 0) && amount <= bp->b_bcount) { bp->b_flags |= B_BUSY; splx(s); - /* - * copy the data from the buffer - */ - bcopy(bp->b_un.b_addr, (caddr_t)kva + i * bsize, amount); + /* + * copy the data from the buffer + */ + bcopy(bp->b_un.b_addr, (caddr_t) kva + i * bsize, amount); if (amount < bsize) { - bzero((caddr_t)kva + amount, bsize - amount); + bzero((caddr_t) kva + amount, bsize - amount); } bp->b_flags &= ~B_BUSY; - wakeup((caddr_t)bp); + wakeup((caddr_t) bp); goto nextblock; } break; } splx(s); fileaddr = vnode_pager_addr(vp, foff + i * bsize); - if( fileaddr != -1) { + if (fileaddr != -1) { bp = getpbuf(); VHOLD(vp); - /* build a minimal buffer header */ + /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if( bp->b_rcred != NOCRED) + if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); - if( bp->b_wcred != NOCRED) + if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr / DEV_BSIZE; bgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; - - /* do the input */ + + /* do the input */ VOP_STRATEGY(bp); - /* we definitely need to be at splbio here */ + /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { - tsleep((caddr_t)bp, PVM, "vnsrd", 0); + tsleep((caddr_t) bp, PVM, "vnsrd", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; - /* - * free the buffer header back to the swap buffer pool - */ + /* + * free the buffer header back to the swap buffer pool + */ relpbuf(bp); HOLDRELE(vp); - if( error) + if (error) break; } else { bzero((caddr_t) kva + i * bsize, bsize); } nextblock: } vm_pager_unmap_page(kva); - if( error) { + if (error) { return VM_PAGER_FAIL; } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags |= PG_CLEAN; m->flags &= ~PG_LAUNDRY; return VM_PAGER_OK; } /* * old style vnode pager output routine */ int vnode_pager_input_old(vnp, m) vn_pager_t vnp; vm_page_t m; { - int i; + int i; struct uio auio; struct iovec aiov; - int error; - int size; + int error; + int size; vm_offset_t foff; vm_offset_t kva; error = 0; foff = m->offset + m->object->paging_offset; + /* * Return failure if beyond current EOF */ if (foff >= vnp->vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (foff + size > vnp->vnp_size) size = vnp->vnp_size - foff; /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. 
*/ kva = vm_pager_map_page(m); - aiov.iov_base = (caddr_t)kva; + aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = foff; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; - auio.uio_procp = (struct proc *)0; + auio.uio_procp = (struct proc *) 0; error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred); if (!error) { register int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) - bzero((caddr_t)kva + count, PAGE_SIZE - count); + bzero((caddr_t) kva + count, PAGE_SIZE - count); } vm_pager_unmap_page(kva); } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags |= PG_CLEAN; m->flags &= ~PG_LAUNDRY; - return error?VM_PAGER_FAIL:VM_PAGER_OK; + return error ? VM_PAGER_FAIL : VM_PAGER_OK; } /* * generic vnode pager input routine */ int vnode_pager_input(vnp, m, count, reqpage) register vn_pager_t vnp; vm_page_t *m; - int count, reqpage; + int count, reqpage; { - int i,j; + int i, j; vm_offset_t kva, foff; - int size; - struct proc *p = curproc; /* XXX */ + int size; + struct proc *p = curproc; /* XXX */ vm_object_t object; vm_offset_t paging_offset; struct vnode *dp, *vp; vm_offset_t mapsize; - int bsize; + int bsize; - int first, last; - int reqaddr, firstaddr; - int run; - int block, offset; + int first, last; + int reqaddr, firstaddr; + int block, offset; - int nbp; + int nbp; struct buf *bp; - int s; - int failflag; + int s; + int failflag; - int errtype=0; /* 0 is file type otherwise vm type */ - int error = 0; + int errtype = 0; /* 0 is file type otherwise vm type */ + int error = 0; - object = m[reqpage]->object; /* all vm_page_t items are in same object */ + object = m[reqpage]->object; /* all vm_page_t items are in same + * object */ paging_offset = object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ + /* - * originally, we did not check for an error return - * value -- assuming an fs always has a bmap entry point - * -- that assumption is wrong!!! + * originally, we did not check for an error return value -- assuming + * an fs always has a bmap entry point -- that assumption is wrong!!! */ kva = 0; mapsize = 0; foff = m[reqpage]->offset + paging_offset; if (!VOP_BMAP(vp, foff, &dp, 0, 0)) { + /* * we do not block for a kva, notice we default to a kva * conservative behavior */ - kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); - if( !kva) { + kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE)); + if (!kva) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } m[0] = m[reqpage]; kva = kmem_alloc_wait(pager_map, mapsize = PAGE_SIZE); reqpage = 0; count = 1; } } /* * if we can't get a kva or we can't bmap, use old VOP code */ if (!kva) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } return vnode_pager_input_old(vnp, m[reqpage]); - /* - * if the blocksize is smaller than a page size, then use - * special small filesystem code. NFS sometimes has a small - * blocksize, but it can handle large reads itself. - */ - } else if( (PAGE_SIZE / bsize) > 1 && - (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + + /* + * if the blocksize is smaller than a page size, then use + * special small filesystem code. NFS sometimes has a small + * blocksize, but it can handle large reads itself. 
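
/*
 * Worked example of the vnode_pager_addr() arithmetic used throughout the
 * input paths, assuming an 8192-byte filesystem block (f_iosize) and a
 * 512-byte DEV_BSIZE: for file offset 20480,
 *
 *	vblock  = 20480 / 8192 = 2
 *	voffset = 20480 % 8192 = 4096
 *
 * and if VOP_BMAP() places logical block 2 at device block 1000, the
 * returned linear disk address is 1000 * 512 + 4096 = 516096.  A failed
 * VOP_BMAP() makes the routine return -1, and the small-block input path
 * zero-fills that block instead of doing I/O.
 */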
+ */ + } else if ((PAGE_SIZE / bsize) > 1 && + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { kmem_free_wakeup(pager_map, kva, mapsize); for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } return vnode_pager_input_smlfs(vnp, m[reqpage]); } - /* * here on direct device I/O */ /* * This pathetic hack gets data from the buffer cache, if it's there. - * I believe that this is not really necessary, and the ends can - * be gotten by defaulting to the normal vfs read behavior, but this + * I believe that this is not really necessary, and the ends can be + * gotten by defaulting to the normal vfs read behavior, but this * might be more efficient, because the will NOT invoke read-aheads - * and one of the purposes of this code is to bypass the buffer - * cache and keep from flushing it by reading in a program. + * and one of the purposes of this code is to bypass the buffer cache + * and keep from flushing it by reading in a program. */ + /* * calculate logical block and offset */ block = foff / bsize; offset = foff % bsize; s = splbio(); /* * if we have a buffer in core, then try to use it */ while (bp = incore(vp, block)) { - int amount; + int amount; - /* - * wait until the buffer is avail or gone - */ + /* + * wait until the buffer is avail or gone + */ if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; - tsleep ((caddr_t)bp, PVM, "vnwblk", 0); + tsleep((caddr_t) bp, PVM, "vnwblk", 0); continue; } - amount = PAGE_SIZE; if ((foff + amount) > vnp->vnp_size) amount = vnp->vnp_size - foff; /* * make sure that this page is in the buffer */ if ((amount > 0) && (offset + amount) <= bp->b_bcount) { bp->b_flags |= B_BUSY; splx(s); /* * map the requested page */ pmap_kenter(kva, VM_PAGE_TO_PHYS(m[reqpage])); pmap_update(); /* * copy the data from the buffer */ - bcopy(bp->b_un.b_addr + offset, (caddr_t)kva, amount); + bcopy(bp->b_un.b_addr + offset, (caddr_t) kva, amount); if (amount < PAGE_SIZE) { - bzero((caddr_t)kva + amount, PAGE_SIZE - amount); + bzero((caddr_t) kva + amount, PAGE_SIZE - amount); } + /* * unmap the page and free the kva */ pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE); kmem_free_wakeup(pager_map, kva, mapsize); + /* * release the buffer back to the block subsystem */ bp->b_flags &= ~B_BUSY; - wakeup((caddr_t)bp); + wakeup((caddr_t) bp); + /* * we did not have to do any work to get the requested * page, the read behind/ahead does not justify a read */ for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } count = 1; reqpage = 0; m[0] = m[reqpage]; /* * sorry for the goto */ goto finishup; } + /* * buffer is nowhere to be found, read from the disk */ break; } splx(s); reqaddr = vnode_pager_addr(vp, foff); s = splbio(); + /* - * Make sure that our I/O request is contiguous. - * Scan backward and stop for the first discontiguous - * entry or stop for a page being in buffer cache. + * Make sure that our I/O request is contiguous. Scan backward and + * stop for the first discontiguous entry or stop for a page being in + * buffer cache. 
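
/*
 * A predicate form (hypothetical helper, not part of this change) of the
 * clustering test used by the two scans below: page i may join the single
 * strategy call only if it is not already backed by an in-core buffer and
 * its disk address lines up exactly with the requested page, i.e. equals
 * reqaddr + (i - reqpage) * PAGE_SIZE.
 */
static int
vnode_pager_iscontig(vp, m, i, reqpage, reqaddr, foff, paging_offset, bsize)
	struct vnode *vp;
	vm_page_t *m;
	int i, reqpage, reqaddr, bsize;
	vm_offset_t foff, paging_offset;
{
	if (incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize))
		return 0;
	return (vnode_pager_addr(vp, m[i]->offset + paging_offset) ==
	    reqaddr + (i - reqpage) * PAGE_SIZE);
}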
*/ failflag = 0; first = reqpage; for (i = reqpage - 1; i >= 0; --i) { if (failflag || - incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || - (vnode_pager_addr(vp, m[i]->offset + paging_offset)) - != reqaddr + (i - reqpage) * PAGE_SIZE) { + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { vnode_pager_freepage(m[i]); failflag = 1; } else { first = i; } } /* - * Scan forward and stop for the first non-contiguous - * entry or stop for a page being in buffer cache. + * Scan forward and stop for the first non-contiguous entry or stop + * for a page being in buffer cache. */ failflag = 0; last = reqpage + 1; for (i = reqpage + 1; i < count; i++) { if (failflag || - incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || - (vnode_pager_addr(vp, m[i]->offset + paging_offset)) - != reqaddr + (i - reqpage) * PAGE_SIZE) { + incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || + (vnode_pager_addr(vp, m[i]->offset + paging_offset)) + != reqaddr + (i - reqpage) * PAGE_SIZE) { vnode_pager_freepage(m[i]); failflag = 1; } else { last = i + 1; } } splx(s); /* - * the first and last page have been calculated now, move input - * pages to be zero based... + * the first and last page have been calculated now, move input pages + * to be zero based... */ count = last; if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; } count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = m[0]->offset + paging_offset; + /* * and get the disk physical address (in bytes) */ firstaddr = vnode_pager_addr(vp, foff); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > vnp->vnp_size) size = vnp->vnp_size - foff; /* * round up physical size for real devices */ - if( dp->v_type == VBLK || dp->v_type == VCHR) + if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); /* * and map the pages to be read into the kva */ for (i = 0; i < count; i++) - pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); pmap_update(); bp = getpbuf(); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if( bp->b_rcred != NOCRED) + if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); - if( bp->b_wcred != NOCRED) + if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = firstaddr / DEV_BSIZE; bgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; /* do the input */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { - tsleep((caddr_t)bp, PVM, "vnread", 0); + tsleep((caddr_t) bp, PVM, "vnread", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) - bzero((caddr_t)kva + size, PAGE_SIZE * count - size); + bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } - pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); kmem_free_wakeup(pager_map, kva, mapsize); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); finishup: for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->flags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; if 
(i != reqpage) { + /* - * whether or not to leave the page activated - * is up in the air, but we should put the page - * on a page queue somewhere. (it already is in - * the object). - * Result: It appears that emperical results show - * that deactivating pages is best. + * whether or not to leave the page activated is up in + * the air, but we should put the page on a page queue + * somewhere. (it already is in the object). Result: + * It appears that emperical results show that + * deactivating pages is best. */ + /* - * just in case someone was asking for this - * page we now tell them that it is ok to use + * just in case someone was asking for this page we + * now tell them that it is ok to use */ if (!error) { vm_page_deactivate(m[i]); PAGE_WAKEUP(m[i]); m[i]->flags &= ~PG_FAKE; - m[i]->act_count = 2; } else { vnode_pager_freepage(m[i]); } } } if (error) { printf("vnode pager read error: %d\n", error); } if (errtype) return error; return (error ? VM_PAGER_FAIL : VM_PAGER_OK); } /* * old-style vnode pager output routine */ int vnode_pager_output_old(vnp, m) register vn_pager_t vnp; vm_page_t m; { vm_offset_t foff; vm_offset_t kva; vm_offset_t size; struct iovec aiov; struct uio auio; struct vnode *vp; - int error; + int error; vp = vnp->vnp_vp; foff = m->offset + m->object->paging_offset; + /* * Return failure if beyond current EOF */ if (foff >= vnp->vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (foff + size > vnp->vnp_size) size = vnp->vnp_size - foff; /* * Allocate a kernel virtual address and initialize so that * we can use VOP_WRITE routines. */ kva = vm_pager_map_page(m); - aiov.iov_base = (caddr_t)kva; + aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = foff; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_resid = size; - auio.uio_procp = (struct proc *)0; + auio.uio_procp = (struct proc *) 0; error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred); if (!error) { if ((size - auio.uio_resid) == 0) { error = EINVAL; } } vm_pager_unmap_page(kva); - return error?VM_PAGER_FAIL:VM_PAGER_OK; + return error ? 
VM_PAGER_FAIL : VM_PAGER_OK; } } /* * vnode pager output on a small-block file system */ int vnode_pager_output_smlfs(vnp, m) vn_pager_t vnp; vm_page_t m; { - int i; - int s; + int i; + int s; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t foff; vm_offset_t kva; - int fileaddr; - int block; + int fileaddr; + int block; vm_offset_t bsize; - int run; - int error = 0; + int error = 0; paging_offset = m->object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; foff = m->offset + paging_offset; VOP_BMAP(vp, foff, &dp, 0, 0); kva = vm_pager_map_page(m); - for(i = 0; !error && i < (PAGE_SIZE/bsize); i++) { - /* - * calculate logical block and offset - */ + for (i = 0; !error && i < (PAGE_SIZE / bsize); i++) { + + /* + * calculate logical block and offset + */ fileaddr = vnode_pager_addr(vp, foff + i * bsize); - if( fileaddr != -1) { + if (fileaddr != -1) { s = splbio(); - if( bp = incore( vp, (foff/bsize) + i)) { - bp = getblk(vp, (foff/bsize) + i, bp->b_bufsize,0, 0); + if (bp = incore(vp, (foff / bsize) + i)) { + bp = getblk(vp, (foff / bsize) + i, bp->b_bufsize, 0, 0); bp->b_flags |= B_INVAL; brelse(bp); } splx(s); bp = getpbuf(); VHOLD(vp); - /* build a minimal buffer header */ + /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_CALL | B_WRITE; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if( bp->b_rcred != NOCRED) + if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); - if( bp->b_wcred != NOCRED) + if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr / DEV_BSIZE; bgetvp(dp, bp); ++dp->v_numoutput; - /* for NFS */ + /* for NFS */ bp->b_dirtyoff = 0; bp->b_dirtyend = bsize; bp->b_bcount = bsize; bp->b_bufsize = bsize; - - /* do the input */ + + /* do the input */ VOP_STRATEGY(bp); - /* we definitely need to be at splbio here */ + /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { - tsleep((caddr_t)bp, PVM, "vnswrt", 0); + tsleep((caddr_t) bp, PVM, "vnswrt", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; - /* - * free the buffer header back to the swap buffer pool - */ + /* + * free the buffer header back to the swap buffer pool + */ relpbuf(bp); HOLDRELE(vp); - } + } } vm_pager_unmap_page(kva); - if( error) + if (error) return VM_PAGER_FAIL; else return VM_PAGER_OK; } /* * generic vnode pager output routine */ int vnode_pager_output(vnp, m, count, rtvals) vn_pager_t vnp; vm_page_t *m; - int count; - int *rtvals; + int count; + int *rtvals; { - int i,j; + int i, j; vm_offset_t kva, foff; - int size; - struct proc *p = curproc; /* XXX */ + int size; + struct proc *p = curproc; /* XXX */ vm_object_t object; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t reqaddr; - int run; - int bsize; - int s; + int bsize; + int s; - int error = 0; + int error = 0; retryoutput: object = m[0]->object; /* all vm_page_t items are in same object */ paging_offset = object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; - for(i=0;ioffset+paging_offset, &dp, 0, 0)) { + if (VOP_BMAP(vp, m[0]->offset + paging_offset, &dp, 0, 0)) { rtvals[0] = vnode_pager_output_old(vnp, m[0]); pmap_clear_modify(VM_PAGE_TO_PHYS(m[0])); m[0]->flags |= PG_CLEAN; m[0]->flags &= ~PG_LAUNDRY; return rtvals[0]; } /* - * if the filesystem has a small blocksize, then use - * the small block filesystem 
output code + * if the filesystem has a small blocksize, then use the small block + * filesystem output code */ if ((bsize < PAGE_SIZE) && - (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { + (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { - for(i=0;iflags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; } } return rtvals[0]; } /* * get some kva for the output */ - kva = kmem_alloc_pageable(pager_map, (mapsize = count*PAGE_SIZE)); - if( !kva) { + kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE)); + if (!kva) { kva = kmem_alloc_pageable(pager_map, (mapsize = PAGE_SIZE)); count = 1; - if( !kva) + if (!kva) return rtvals[0]; } - - for(i=0;ioffset + paging_offset; if (foff >= vnp->vnp_size) { - for(j=i;joffset + paging_offset; reqaddr = vnode_pager_addr(vp, foff); + /* - * Scan forward and stop for the first non-contiguous - * entry or stop for a page being in buffer cache. + * Scan forward and stop for the first non-contiguous entry or stop + * for a page being in buffer cache. */ for (i = 1; i < count; i++) { - if ( vnode_pager_addr(vp, m[i]->offset + paging_offset) - != reqaddr + i * PAGE_SIZE) { + if (vnode_pager_addr(vp, m[i]->offset + paging_offset) + != reqaddr + i * PAGE_SIZE) { count = i; break; } } /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > vnp->vnp_size) size = vnp->vnp_size - foff; /* * round up physical size for real devices */ - if( dp->v_type == VBLK || dp->v_type == VCHR) + if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); /* * and map the pages to be read into the kva */ for (i = 0; i < count; i++) - pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); + pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); pmap_update(); /* printf("vnode: writing foff: %d, devoff: %d, size: %d\n", foff, reqaddr, size); */ + /* * next invalidate the incore vfs_bio data */ for (i = 0; i < count; i++) { - int filblock = (foff + i * PAGE_SIZE) / bsize; + int filblock = (foff + i * PAGE_SIZE) / bsize; struct buf *fbp; s = splbio(); - if( fbp = incore( vp, filblock)) { - /* printf("invalidating: %d\n", filblock); */ - fbp = getblk(vp, filblock, fbp->b_bufsize,0,0); + if (fbp = incore(vp, filblock)) { + fbp = getblk(vp, filblock, fbp->b_bufsize, 0, 0); + if (fbp->b_flags & B_DELWRI) { + if (fbp->b_bufsize <= PAGE_SIZE) + fbp->b_flags &= ~B_DELWRI; + else { + bwrite(fbp); + fbp = getblk(vp, filblock, + fbp->b_bufsize, 0, 0); + } + } fbp->b_flags |= B_INVAL; brelse(fbp); } splx(s); } bp = getpbuf(); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_WRITE | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; - if( bp->b_rcred != NOCRED) + if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); - if( bp->b_wcred != NOCRED) + if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr / DEV_BSIZE; bgetvp(dp, bp); ++dp->v_numoutput; - + /* for NFS */ bp->b_dirtyoff = 0; bp->b_dirtyend = size; bp->b_bcount = size; bp->b_bufsize = size; /* do the output */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { - tsleep((caddr_t)bp, PVM, "vnwrite", 0); + tsleep((caddr_t) bp, PVM, "vnwrite", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); kmem_free_wakeup(pager_map, kva, mapsize); /* * free 
the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); - if( !error) { - for(i=0;iflags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; rtvals[i] = VM_PAGER_OK; } - } else if( count != 1) { + } else if (count != 1) { error = 0; count = 1; goto retryoutput; } - if (error) { printf("vnode pager write error: %d\n", error); } return (error ? VM_PAGER_FAIL : VM_PAGER_OK); } -
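
/*
 * Note on the B_DELWRI handling added to the pageout path above (the
 * change itself is uncommented, so this reading of the intent is inferred
 * from the code): an overlapping in-core buffer cannot simply be marked
 * B_INVAL while it still carries delayed-write data covering more than the
 * page being cleaned, since that data has not reached the disk yet; such a
 * buffer is written with bwrite() first, then reacquired and invalidated.
 * A delayed-write buffer no larger than a page is superseded by this very
 * pageout, so it only needs its B_DELWRI flag dropped before the
 * B_INVAL/brelse().
 */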