Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 1886) +++ head/sys/amd64/amd64/machdep.c (revision 1887) @@ -1,1532 +1,1535 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.45 1994/08/03 02:45:26 davidg Exp $ + * $Id: machdep.c,v 1.46 1994/08/04 06:10:27 davidg Exp $ */ #include "npx.h" #include "isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include "sys/shm.h" #endif #ifdef SYSVMSG #include "msg.h" #endif #ifdef SYSVSEM #include "sem.h" #endif #include "vm/vm.h" #include "vm/vm_kern.h" #include "vm/vm_page.h" #include "sys/exec.h" #include "sys/vnode.h" extern vm_offset_t avail_start, avail_end; #include "machine/cpu.h" #include "machine/reg.h" #include "machine/psl.h" #include "machine/specialreg.h" #include "machine/sysarch.h" #include "machine/cons.h" #include "i386/isa/isa.h" #include "i386/isa/rtc.h" static void identifycpu(void); static void initcpu(void); static int test_page(int *, int); extern int grow(struct proc *,u_int); const char machine[] = "PC-Class"; const char *cpu_model; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif /* * Declare these as initialized data so we can patch them. 
*/ int nswbuf = 0; #ifdef NBUF int nbuf = NBUF; #else int nbuf = 0; #endif #ifdef BUFPAGES int bufpages = BUFPAGES; #else int bufpages = 0; #endif #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif int msgbufmapped = 0; /* set when safe to use msgbuf */ extern int freebufspace; extern char *bouncememory; int _udatasel, _ucodesel; /* * Machine-dependent startup code */ int boothowto = 0, Maxmem = 0, badpages = 0, physmem = 0; long dumplo; extern int bootdev; int biosmem; vm_offset_t phys_avail[6]; extern cyloffset; int cpu_class; void dumpsys __P((void)); vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; vm_offset_t pager_sva, pager_eva; int maxbkva, pager_map_size; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) void cpu_startup() { register int unixsize; register unsigned i; register struct pte *pte; int mapaddr, j; register caddr_t v; int maxbufs, base, residual; extern long Usrptsize; vm_offset_t minaddr, maxaddr; vm_size_t size = 0; int firstaddr; /* * Initialize error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); identifycpu(); printf("real memory = %d (%d pages)\n", ptoa(physmem), physmem); if (badpages) printf("bad memory = %d (%d pages)\n", ptoa(badpages), badpages); /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. 
The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! */ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif /* * Determine how many buffers to allocate. * Use 20% of memory of memory beyond the first 2MB * Insure a minimum of 16 fs buffers. * We allocate 1/2 as many swap buffer headers as file i/o buffers. 
*/ if (bufpages == 0) bufpages = ((physmem << PGSHIFT) - 2048*1024) / NBPG / 5; if (bufpages < 64) bufpages = 64; /* * We must still limit the maximum number of buffers to be no * more than 2/5's of the size of the kernal malloc region, this * will only take effect for machines with lots of memory */ bufpages = min(bufpages, (VM_KMEM_SIZE / NBPG) * 2 / 5); if (nbuf == 0) { nbuf = bufpages / 2; if (nbuf < 32) nbuf = 32; } freebufspace = bufpages * NBPG; if (nswbuf == 0) { nswbuf = (nbuf / 2) &~ 1; /* force even */ - if (nswbuf > 256) - nswbuf = 256; /* sanity */ + if (nswbuf > 64) + nswbuf = 64; /* sanity */ } valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifndef NOBOUNCE /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) bouncepages = 96; /* largest physio size + extra */ v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, - (nbuf*MAXBSIZE) + VM_PHYS_SIZE + maxbkva + pager_map_size, TRUE); - - io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); + (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + + maxbkva + pager_map_size, TRUE); + buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, + (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, - pager_map_size, TRUE); + (nswbuf*MAXPHYS) + pager_map_size, TRUE); + io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); - buffer_map 
= kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, - (nbuf * MAXBSIZE), TRUE); +#if 0 /* * Allocate a submap for physio */ phys_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, VM_PHYS_SIZE, TRUE); +#endif /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(NMBCLUSTERS+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, NMBCLUSTERS+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, VM_MBUF_SIZE, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; printf("avail memory = %d (%d pages)\n", ptoa(cnt.v_free_count), cnt.v_free_count); printf("using %d buffers containing %d bytes of memory\n", nbuf, bufpages * CLBYTES); #ifndef NOBOUNCE /* * init bounce buffers */ vm_bounce_init(); #endif /* * Set up CPU-specific registers, cache, etc. */ initcpu(); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); + vm_pager_bufferinit(); /* * Configure the system. */ configure(); } struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "i586", CPUCLASS_586 }, /* CPU_586 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { printf("%s", i386_cpus[cpu].cpu_name); cpu_class = i386_cpus[cpu].cpu_class; cpu_model = i386_cpus[cpu].cpu_name; } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } printf(" ("); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; case CPUCLASS_386: printf("386"); break; case CPUCLASS_486: printf("486"); break; case CPUCLASS_586: printf("586"); break; default: printf("unknown"); /* will panic below... 
*/ } printf("-class CPU)"); printf("\n"); /* cpu speed would be nice, but how? */ /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif panic("CPU class not configured"); default: break; } } #ifdef PGINPROF /* * Return the difference (in microseconds) * between the current time and a previous * time as represented by the arguments. * If there is a pending clock interrupt * which has not been serviced due to high * ipl, return error code. */ /*ARGSUSED*/ vmtime(otime, olbolt, oicr) register int otime, olbolt, oicr; { return (((time.tv_sec-otime)*60 + lbolt-olbolt)*16667); } #endif extern int kstack[]; /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigacts *psp = p->p_sigacts; int oonstack, frmtrap; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. 
*/ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_base + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ fp->sf_signum = sig; fp->sf_code = code; fp->sf_scp = &fp->sf_sc; fp->sf_addr = (char *) regs[tERR]; fp->sf_handler = catcher; /* save scratch registers */ fp->sf_sc.sc_eax = regs[tEAX]; fp->sf_sc.sc_ebx = regs[tEBX]; fp->sf_sc.sc_ecx = regs[tECX]; fp->sf_sc.sc_edx = regs[tEDX]; fp->sf_sc.sc_esi = regs[tESI]; fp->sf_sc.sc_edi = regs[tEDI]; fp->sf_sc.sc_cs = regs[tCS]; fp->sf_sc.sc_ds = regs[tDS]; fp->sf_sc.sc_ss = regs[tSS]; fp->sf_sc.sc_es = regs[tES]; fp->sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ fp->sf_sc.sc_onstack = oonstack; fp->sf_sc.sc_mask = mask; fp->sf_sc.sc_sp = regs[tESP]; fp->sf_sc.sc_fp = regs[tEBP]; fp->sf_sc.sc_pc = regs[tEIP]; fp->sf_sc.sc_ps = regs[tEFLAGS]; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). 
* Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ struct sigreturn_args { struct sigcontext *sigcntxp; }; int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); eflags = scp->sc_ps; if ((eflags & PSL_USERCLR) != 0 || (eflags & PSL_USERSET) != PSL_USERSET || (eflags & PSL_IOPL) < (regs[tEFLAGS] & PSL_IOPL)) { #ifdef DEBUG printf("sigreturn: eflags=0x%x\n", eflags); #endif return(EINVAL); } /* * Sanity check the user's selectors and error if they * are suspect. */ #define max_ldt_sel(pcb) \ ((pcb)->pcb_ldt ? 
(pcb)->pcb_ldt_len : (sizeof(ldt) / sizeof(ldt[0]))) #define valid_ldt_sel(sel) \ (ISLDT(sel) && ISPL(sel) == SEL_UPL && \ IDXSEL(sel) < max_ldt_sel(&p->p_addr->u_pcb)) #define null_sel(sel) \ (!ISLDT(sel) && IDXSEL(sel) == 0) if ((scp->sc_cs&0xffff != _ucodesel && !valid_ldt_sel(scp->sc_cs)) || (scp->sc_ss&0xffff != _udatasel && !valid_ldt_sel(scp->sc_ss)) || (scp->sc_ds&0xffff != _udatasel && !valid_ldt_sel(scp->sc_ds) && !null_sel(scp->sc_ds)) || (scp->sc_es&0xffff != _udatasel && !valid_ldt_sel(scp->sc_es) && !null_sel(scp->sc_es))) { #ifdef DEBUG printf("sigreturn: cs=0x%x ss=0x%x ds=0x%x es=0x%x\n", scp->sc_cs, scp->sc_ss, scp->sc_ds, scp->sc_es); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } #undef max_ldt_sel #undef valid_ldt_sel #undef null_sel /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } /* * a simple function to make the system panic (and dump a vmcore) * in a predictable fashion */ void diediedie() { panic("because you said to!"); } int waittime = -1; struct pcb dumppcb; void boot(arghowto) int arghowto; { register long dummy; /* r12 is reserved */ register int howto; /* r11 == how to boot */ register int devtype; /* r10 == major of root dev */ extern int cold; int nomsg = 1; if (cold) { printf("hit reset please"); for(;;); } howto = arghowto; if ((howto&RB_NOSYNC) == 0 && waittime < 0) 
{ register struct buf *bp; int iter, nbusy; waittime = 0; (void) splnet(); printf("syncing disks... "); /* * Release inodes held by texts before update. */ if (panicstr == 0) vnode_pager_umount(NULL); sync(curproc, NULL, NULL); /* * Unmount filesystems */ #if 0 if (panicstr == 0) vfs_unmountall(); #endif for (iter = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY) nbusy++; if (nbusy == 0) break; if (nomsg) { printf("updating disks before rebooting... "); nomsg = 0; } printf("%d ", nbusy); DELAY(40000 * iter); } if (nbusy) printf("giving up\n"); else printf("done\n"); DELAY(10000); /* wait for printf to finish */ } splhigh(); devtype = major(rootdev); if (howto&RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cngetc(); } else { if (howto & RB_DUMP) { savectx(&dumppcb, 0); dumppcb.pcb_ptd = rcr3(); dumpsys(); if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { int loop; printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME; loop > 0; --loop) { DELAY(1000 * 1000); /* one second */ if (sgetc(1)) /* Did user type a key? */ break; } if (!loop) goto die; } } else { /* zero time specified - reboot NOW */ goto die; } printf("--> Press a key on the console to reboot <--\n"); cngetc(); } } #ifdef lint dummy = 0; dummy = dummy; printf("howto %d, devtype %d\n", arghowto, devtype); #endif die: printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ cpu_reset(); for(;;) ; /* NOTREACHED */ } unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */ int dumpsize = 0; /* also for savecore */ /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. 
*/ void dumpsys() { if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; dumpsize = Maxmem; printf("\ndumping to dev %x, offset %d\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)].d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } #ifdef HZ /* * If HZ is defined we use this code, otherwise the code in * /sys/i386/i386/microtime.s is used. The othercode only works * for HZ=100. */ microtime(tvp) register struct timeval *tvp; { int s = splhigh(); *tvp = time; tvp->tv_usec += tick; while (tvp->tv_usec > 1000000) { tvp->tv_sec++; tvp->tv_usec -= 1000000; } splx(s); } #endif /* HZ */ static void initcpu() { } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { p->p_md.md_regs[tEBP] = 0; /* bottom of the fp chain */ p->p_md.md_regs[tEIP] = entry; p->p_md.md_regs[tESP] = stack; p->p_md.md_regs[tSS] = _udatasel; p->p_md.md_regs[tDS] = _udatasel; p->p_md.md_regs[tES] = _udatasel; p->p_md.md_regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } /* * machine dependent system variables. 
*/ int cpu_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) int *name; u_int namelen; void *oldp; size_t *oldlenp; void *newp; size_t newlen; struct proc *p; { /* all sysctl names at this level are terminal */ if (namelen != 1) return (ENOTDIR); /* overloaded */ switch (name[0]) { case CPU_CONSDEV: return (sysctl_rdstruct(oldp, oldlenp, newp, &cn_tty->t_dev, sizeof cn_tty->t_dev)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ union descriptor gdt[NGDT]; union descriptor ldt[NLDT]; /* local descriptor table */ struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ int _default_ldt, currentldt; struct i386tss tss, panic_tss; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - 
default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Panic Tss Descriptor */ { (int) &panic_tss, /* segment base address */ sizeof(tss)-1, /* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Proc 0 Tss Descriptor */ { (int) kstack, /* segment base address */ sizeof(tss)-1, /* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, }; struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit 
granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ } }; void setidt(idx, func, typ, dpl) int idx; void (*func)(); int typ; int dpl; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = 8; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) typedef void idtvec_t(); extern idtvec_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(dble), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(rsvd0), IDTVEC(rsvd1), IDTVEC(rsvd2), IDTVEC(rsvd3), IDTVEC(rsvd4), IDTVEC(rsvd5), IDTVEC(rsvd6), IDTVEC(rsvd7), IDTVEC(rsvd8), IDTVEC(rsvd9), IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13), IDTVEC(rsvd14), IDTVEC(syscall); int _gsel_tss; void init386(first) int first; { extern ssdtosd(), lgdt(), lidt(), lldt(), etext; int x, *pi; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; extern int 
sigcode,szsigcode; /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; int pagesinbase, pagesinext; int target_page; extern struct pte *CMAP1; extern caddr_t CADDR1; proc0.p_addr = proc0paddr; /* * Initialize the console before we print anything out. */ cninit (); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(i386_round_page(&etext)) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; for (x=0; x < NGDT; x++) ssdtosd(gdt_segs+x, gdt+x); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. */ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * NBPG) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; /* Note. 
eventually want private ldts per process */ for (x=0; x < 5; x++) ssdtosd(ldt_segs+x, ldt+x); /* exceptions */ setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL); setidt(8, &IDTVEC(dble), SDT_SYS386TGT, SEL_KPL); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL); setidt(17, &IDTVEC(rsvd0), SDT_SYS386TGT, SEL_KPL); setidt(18, &IDTVEC(rsvd1), SDT_SYS386TGT, SEL_KPL); setidt(19, &IDTVEC(rsvd2), SDT_SYS386TGT, SEL_KPL); setidt(20, &IDTVEC(rsvd3), SDT_SYS386TGT, SEL_KPL); setidt(21, &IDTVEC(rsvd4), SDT_SYS386TGT, SEL_KPL); setidt(22, &IDTVEC(rsvd5), SDT_SYS386TGT, SEL_KPL); setidt(23, &IDTVEC(rsvd6), SDT_SYS386TGT, SEL_KPL); setidt(24, &IDTVEC(rsvd7), SDT_SYS386TGT, SEL_KPL); setidt(25, &IDTVEC(rsvd8), SDT_SYS386TGT, SEL_KPL); setidt(26, &IDTVEC(rsvd9), SDT_SYS386TGT, SEL_KPL); setidt(27, &IDTVEC(rsvd10), SDT_SYS386TGT, SEL_KPL); setidt(28, &IDTVEC(rsvd11), SDT_SYS386TGT, SEL_KPL); setidt(29, &IDTVEC(rsvd12), SDT_SYS386TGT, SEL_KPL); setidt(30, &IDTVEC(rsvd13), SDT_SYS386TGT, SEL_KPL); setidt(31, &IDTVEC(rsvd14), SDT_SYS386TGT, SEL_KPL); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 
lldt(_default_ldt); currentldt = _default_ldt; #include "ddb.h" #if NDDB > 0 kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) biosbasemem = 640; /* * Some 386 machines might give us a bogus number for extended * mem. If this happens, stop now. */ #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); /* NOTREACHED */ } #endif pagesinbase = biosbasemem * 1024 / NBPG; pagesinext = biosextmem * 1024 / NBPG; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * XXX - this should be removed when bounce buffers are * implemented. */ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's the highest page of * of the physical address space. It should be "Maxphyspage". */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; #ifdef MAXMEM if (MAXMEM/4 < Maxmem) Maxmem = MAXMEM/4; #endif /* * Calculate number of physical pages, but account for Maxmem * limitation above. */ physmem = pagesinbase + (min(pagesinext + 0x100000/PAGE_SIZE, Maxmem) - 0x100000/PAGE_SIZE); /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Do simple memory test over range of extended memory that BIOS * indicates exists. Adjust Maxmem to the highest page of * good memory. 
*/ printf("Testing memory (%dMB)...", ptoa(Maxmem)/1024/1024); for (target_page = Maxmem - 1; target_page >= atop(first); target_page--) { /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_KW | PG_N | ptoa(target_page); tlbflush(); /* * Test for alternating 1's and 0's */ filli(0xaaaaaaaa, CADDR1, PAGE_SIZE/sizeof(int)); if (test_page((int *)CADDR1, 0xaaaaaaaa)) { Maxmem = target_page; badpages++; continue; } /* * Test for alternating 0's and 1's */ filli(0x55555555, CADDR1, PAGE_SIZE/sizeof(int)); if (test_page((int *)CADDR1, 0x55555555)) { Maxmem = target_page; badpages++; continue; } /* * Test for all 1's */ filli(0xffffffff, CADDR1, PAGE_SIZE/sizeof(int)); if (test_page((int *)CADDR1, 0xffffffff)) { Maxmem = target_page; badpages++; continue; } /* * Test zeroing of page */ bzero(CADDR1, PAGE_SIZE); if (test_page((int *)CADDR1, 0)) { /* * test of page failed */ Maxmem = target_page; badpages++; continue; } } printf("done.\n"); *(int *)CMAP1 = 0; tlbflush(); avail_end = (Maxmem << PAGE_SHIFT) - i386_round_page(sizeof(struct msgbuf)); /* * Initialize pointers to the two chunks of memory; for use * later in vm_page_startup. */ /* avail_start is initialized in pmap_bootstrap */ x = 0; if (pagesinbase > 1) { phys_avail[x++] = NBPG; /* skip first page of memory */ phys_avail[x++] = pagesinbase * NBPG; /* memory up to the ISA hole */ } phys_avail[x++] = avail_start; /* memory up to the end */ phys_avail[x++] = avail_end; phys_avail[x++] = 0; /* no more chunks */ phys_avail[x++] = 0; /* now running on new page tables, configured,and u/iom is accessible */ /* make a initial tss so microp can get interrupt stack on syscall! 
*/ proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG; proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; _gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt = (sizeof(tss))<<16; ltr(_gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode); proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_ptd = IdlePTD; } int test_page(address, pattern) int *address; int pattern; { int *x; for (x = address; x < (int *)((char *)address + PAGE_SIZE); x++) { if (*x != pattern) return (1); } return(0); } /* * The registers are in the frame; the frame is in the user area of * the process in question; when the process is active, the registers * are in "the kernel stack"; when it's not, they're still there, but * things get flipped around. So, since p->p_md.md_regs is the whole address * of the register set, take its offset from the kernel stack, and * index into the user block. Don't you just *love* virtual memory? * (I'm starting to think seymour is right...) */ int ptrace_set_pc (struct proc *p, unsigned int addr) { void *regs = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); ((struct trapframe *)regs)->tf_eip = addr; return 0; } int ptrace_single_step (struct proc *p) { void *regs = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); ((struct trapframe *)regs)->tf_eflags |= PSL_T; return 0; } /* * Copy the registers to user-space. 
*/ int ptrace_getregs (struct proc *p, unsigned int *addr) { int error; struct reg regs = {0}; if (error = fill_regs (p, ®s)) return error; return copyout (®s, addr, sizeof (regs)); } int ptrace_setregs (struct proc *p, unsigned int *addr) { int error; struct reg regs = {0}; if (error = copyin (addr, ®s, sizeof(regs))) return error; return set_regs (p, ®s); } int fill_regs(struct proc *p, struct reg *regs) { int error; struct trapframe *tp; void *ptr = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); tp = ptr; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return 0; } int set_regs (struct proc *p, struct reg *regs) { int error; struct trapframe *tp; void *ptr = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); tp = ptr; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return 0; } #include "ddb.h" #if NDDB <= 0 void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid #define dkpart(dev) (minor(dev) & 7) /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. 
*/
/*
 * bounds_check_with_label:
 *	Clip the transfer described by bp against the partition of lp
 *	selected by the buffer's minor device.  Protects the on-disk
 *	label from writes (unless wlabel), truncates transfers that run
 *	past the partition, and converts an exactly-at-end read into EOF.
 *
 *	Returns 1 if the (possibly adjusted) transfer should proceed,
 *	0 for EOF-at-end-of-partition, -1 on error (bp->b_error set,
 *	B_ERROR raised).
 */
int
bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel)
{
	struct partition *p = lp->d_partitions + dkpart(bp->b_dev);
	int labelsect = lp->d_partitions[0].p_offset;
	/* transfer size in DEV_BSIZE sectors, rounded up */
	int maxsz = p->p_size,
		sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;

	/* overwriting disk label ? */
	/* XXX should also protect bootstrap in first 8K */
	if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect &&
#if LABELSECTOR != 0
	    bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect &&
#endif
	    (bp->b_flags & B_READ) == 0 && wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}

#if defined(DOSBBSECTOR) && defined(notyet)
	/* overwriting master boot record? */
	if (bp->b_blkno + p->p_offset <= DOSBBSECTOR &&
	    (bp->b_flags & B_READ) == 0 && wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}
#endif

	/* beyond partition? */
	if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) {
		/* if exactly at end of disk, return an EOF */
		if (bp->b_blkno == maxsz) {
			bp->b_resid = bp->b_bcount;
			return(0);
		}
		/* or truncate if part of it fits */
		sz = maxsz - bp->b_blkno;
		if (sz <= 0) {
			bp->b_error = EINVAL;
			goto bad;
		}
		bp->b_bcount = sz << DEV_BSHIFT;
	}

	/* calculate cylinder for disksort to order transfers with */
	bp->b_pblkno = bp->b_blkno + p->p_offset;
	bp->b_cylin = bp->b_pblkno / lp->d_secpercyl;
	return(1);

bad:
	bp->b_flags |= B_ERROR;
	return(-1);
}
Index: head/sys/amd64/amd64/pmap.c
===================================================================
--- head/sys/amd64/amd64/pmap.c	(revision 1886)
+++ head/sys/amd64/amd64/pmap.c	(revision 1887)
@@ -1,2026 +1,1991 @@
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.26 1994/05/25 08:54:35 rgrimes Exp $ + * $Id: pmap.c,v 1.27 1994/08/03 02:45:28 davidg Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include #include #include #include #include #include #include #include #include #include /* * Allocate various and sundry SYSMAPs used in the days of old VM * and not yet converted. XXX. */ #define BSDVM_COMPAT 1 /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) int protection_codes[8]; struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t phys_avail[6]; /* 2 entries + 1 null */ vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_size_t mem_size; /* memory size in bytes */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss)*/ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int i386pagesperpage; /* PAGE_SIZE / I386_PAGE_SIZE */ boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ vm_offset_t vm_first_phys, vm_last_phys; static inline boolean_t pmap_testbit(); static inline void pmap_changebit(); static inline int pmap_is_managed(); static inline void *vm_get_pmap(); static inline void vm_put_pmap(); inline void pmap_use_pt(); inline void pmap_unuse_pt(); -inline pt_entry_t * const pmap_pte(); +inline pt_entry_t * pmap_pte(); static inline pv_entry_t get_pv_entry(); void pmap_alloc_pv_entry(); void pmap_clear_modify(); -void i386_protection_init(); +static void i386_protection_init(); + +void pmap_kenter __P((vm_offset_t, vm_offset_t)); +void pmap_kremove __P((vm_offset_t)); +void pmap_qenter __P((vm_offset_t, vm_page_t *, int)); +void pmap_qremove __P((vm_offset_t, int)); + extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; #if BSDVM_COMPAT #include "msgbuf.h" /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1, *CMAP2, *ptmmap; caddr_t CADDR1, CADDR2, ptvmmap; pt_entry_t *msgbufmap; struct msgbuf *msgbufp; #endif void init_pv_entries(int) ; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ inline pt_entry_t * const pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ( (pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if ( frame != ((int) APTDpde & PG_FRAME) ) { APTDpde = pmap->pm_pdir[PTDPTDI]; tlbflush(); } return((pt_entry_t *) avtopte(va)); } } return(0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
*/
/*
 * pmap_extract:
 *	Return the physical address backing virtual address va in pmap,
 *	with the page offset from va preserved; returns 0 when no mapping
 *	exists.  May retarget the alternate page-table map (APTDpde) when
 *	pmap is neither the kernel's nor the current address space.
 */
vm_offset_t
pmap_extract(pmap, va)
	register pmap_t pmap;
	vm_offset_t va;
{
	pd_entry_t save;	/* XXX unused */
	vm_offset_t pa;
	int s;			/* XXX unused */

	if (pmap && *pmap_pde(pmap, va)) {
		vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME;
		/* are we current address space or kernel? */
		if ( (pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME)) ) {
			pa = *(int *) vtopte(va);
		/* otherwise, we are alternate address space */
		} else {
			if ( frame != ((int) APTDpde & PG_FRAME)) {
				/* point the alternate recursive map at pmap */
				APTDpde = pmap->pm_pdir[PTDPTDI];
				tlbflush();
			}
			pa = *(int *) avtopte(va);
		}
		/* combine the PTE's frame with the offset bits of va */
		pa = (pa & PG_FRAME) | (va & ~PG_FRAME);
		return pa;
	}
	return 0;
}

/*
 * determine if a page is managed (memory vs. device)
 */
static inline int
pmap_is_managed(pa)
	vm_offset_t pa;
{
	int i;

	if (!pmap_initialized)
		return 0;

	/* managed pages are exactly those inside a phys_avail[] chunk */
	for (i = 0; phys_avail[i + 1]; i += 2) {
		if (pa >= phys_avail[i] && pa < phys_avail[i + 1])
			return 1;
	}
	return 0;
}

/*
 * find the vm_page_t of a pte (only) given va of pte and pmap
 */
inline vm_page_t
pmap_pte_vm_page(pmap, pt)
	pmap_t pmap;
	vm_offset_t pt;
{
	pt = i386_trunc_page( pt);
	/* index of the page-table page within the user PT range */
	pt = (pt - UPT_MIN_ADDRESS) / NBPG;
	pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME;
	return PHYS_TO_VM_PAGE(pt);
}

/*
 * Wire a page table page
 */
inline void
pmap_use_pt(pmap, va)
	pmap_t pmap;
	vm_offset_t va;
{
	vm_offset_t pt;

	/* kernel addresses and pre-init mappings are never held */
	if (va >= VM_MAX_ADDRESS || !pmap_initialized)
		return;

	pt = (vm_offset_t) vtopte(va);
	vm_page_hold( pmap_pte_vm_page(pmap, pt));
}

/*
 * Unwire a page table page
 */
inline void
pmap_unuse_pt(pmap, va)
	pmap_t pmap;
	vm_offset_t va;
{
	vm_offset_t pt;

	if (va >= VM_MAX_ADDRESS || !pmap_initialized)
		return;

	pt = (vm_offset_t) vtopte(va);
	vm_page_unhold( pmap_pte_vm_page(pmap, pt));
}

/* [ macro again?, should I force kstack into user map here? -wfj ] */
void
pmap_activate(pmap, pcbp)
	register pmap_t pmap;
	struct pcb *pcbp;
{
	PMAP_ACTIVATE(pmap, pcbp);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 * Map the kernel's code and data, and allocate the system page table.
* * On the I386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ #define DMAPAGES 8 void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { #if BSDVM_COMPAT vm_offset_t va; pt_entry_t *pte; #endif extern int IdlePTD; avail_start = firstaddr + DMAPAGES*NBPG; virtual_avail = (vm_offset_t) KERNBASE + avail_start; virtual_end = VM_MAX_KERNEL_ADDRESS; i386pagesperpage = PAGE_SIZE / NBPG; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't * have to use pmap_create, which is unlikely to work * correctly at this part of the boot sequence. */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *)(KERNBASE + IdlePTD); simple_lock_init(&kernel_pmap->pm_lock); kernel_pmap->pm_count = 1; #if BSDVM_COMPAT /* * Allocate all the submaps we need */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*NBPG); p = pte; pte += (n); va = virtual_avail; pte = pmap_pte(kernel_pmap, va); SYSMAP(caddr_t ,CMAP1 ,CADDR1 ,1 ) SYSMAP(caddr_t ,CMAP2 ,CADDR2 ,1 ) SYSMAP(caddr_t ,ptmmap ,ptvmmap ,1 ) SYSMAP(struct msgbuf * ,msgbufmap ,msgbufp ,1 ) virtual_avail = va; #endif /* * reserve special hunk of memory for use by bus dma as a bounce * buffer (contiguous virtual *and* physical memory). for now, * assume vm does not use memory beneath hole, and we know that * the bootstrap uses top 32k of base memory. -wfj */ { extern vm_offset_t isaphysmem; isaphysmem = va; virtual_avail = pmap_map(va, firstaddr, firstaddr + DMAPAGES*NBPG, VM_PROT_ALL); } *(int *)CMAP1 = *(int *)CMAP2 = *(int *)PTD = 0; tlbflush(); } /* * Initialize the pmap module. 
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 * pmap_init has been enhanced to support in a fairly consistant
 * way, discontiguous physical memory.
 */
void
pmap_init(phys_start, phys_end)
	vm_offset_t phys_start, phys_end;
{
	vm_offset_t addr, addr2;	/* addr2 XXX unused */
	vm_size_t npg, s;
	int rv;				/* XXX unused */
	int i;
	extern int KPTphys;
	extern int IdlePTD;

	/*
	 * Now that kernel map has been allocated, we can mark as
	 * unavailable regions which we have mapped in locore.
	 */
	addr = atdevbase;
	(void) vm_map_find(kernel_map, NULL, (vm_offset_t) 0,
			&addr, (0x100000-0xa0000), FALSE);

	addr = (vm_offset_t) KERNBASE + IdlePTD;
	vm_object_reference(kernel_object);
	(void) vm_map_find(kernel_map, kernel_object, addr,
			&addr, (4 + NKPT) * NBPG, FALSE);

	/*
	 * calculate the number of pv_entries needed:
	 * walk phys_avail[] to its terminating zero pair, then size
	 * from the first to the last available physical page.
	 */
	vm_first_phys = phys_avail[0];
	for (i = 0; phys_avail[i + 1]; i += 2) ;
	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	s = (vm_size_t) (sizeof(struct pv_entry) * npg);
	s = i386_round_page(s);
	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
	pv_table = (pv_entry_t) addr;

	/*
	 * init the pv free list
	 */
	init_pv_entries(npg);
	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * For now, VM is already on, we only need to map the
 * specified memory.
 */
vm_offset_t
pmap_map(virt, start, end, prot)
	vm_offset_t virt;
	vm_offset_t start;
	vm_offset_t end;
	int prot;
{
	/* enter one page at a time; return the next unused VA */
	while (start < end) {
		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
		virt += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	return(virt);
}

/*
 * Create and return a physical map.
 *
 * If the size specified for the map
 * is zero, the map is an actual physical
 * map, and may be referenced by the
 * hardware.
 *
 * If the size specified is non-zero,
 * the map will be used in software only, and
 * is bounded by that size.
 *
 * [ just allocate a ptd and mark it uninitialize -- should we track
 *   with a table which process has which ptd? -wfj ]
 */
pmap_t
pmap_create(size)
	vm_size_t size;
{
	register pmap_t pmap;

	/*
	 * Software use map does not need a pmap
	 */
	if (size)
		return(NULL);

	/* M_WAITOK: this allocation may sleep rather than fail */
	pmap = (pmap_t) malloc(sizeof *pmap, M_VMPMAP, M_WAITOK);
	bzero(pmap, sizeof(*pmap));
	pmap_pinit(pmap);
	return (pmap);
}

/* free-list node overlaid on a released page-directory page */
struct pmaplist {
	struct pmaplist *next;
};

/*
 * Allocate one zeroed, wired kernel page to hold a page directory.
 */
static inline void *
vm_get_pmap()
{
	struct pmaplist *rtval;

	rtval = (struct pmaplist *)kmem_alloc(kernel_map, ctob(1));
	bzero(rtval, ctob(1));
	return rtval;
}

/*
 * Return a page-directory page obtained from vm_get_pmap().
 */
static inline void
vm_put_pmap(up)
	struct pmaplist *up;
{
	kmem_free(kernel_map, (vm_offset_t)up, ctob(1));
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap)
	register struct pmap *pmap;
{
	/*
	 * No need to allocate page table space yet but we do need a
	 * valid page directory table.
	 */
	pmap->pm_pdir = (pd_entry_t *) vm_get_pmap();

	/* wire in kernel global address entries */
	bcopy(PTD+KPTDI, pmap->pm_pdir+KPTDI, NKPT*PTESIZE);

	/* install self-referential address mapping entry */
	*(int *)(pmap->pm_pdir+PTDPTDI) =
		((int)pmap_kextract((vm_offset_t)pmap->pm_pdir)) | PG_V | PG_KW;

	pmap->pm_count = 1;
	simple_lock_init(&pmap->pm_lock);
}

/*
 * Retire the given physical map from service.
 * Should only be called if the map contains
 * no valid mappings.
 */
void
pmap_destroy(pmap)
	register pmap_t pmap;
{
	int count;

	if (pmap == NULL)
		return;

	/* drop a reference; release storage when the last one goes */
	simple_lock(&pmap->pm_lock);
	count = --pmap->pm_count;
	simple_unlock(&pmap->pm_lock);
	if (count == 0) {
		pmap_release(pmap);
		free((caddr_t)pmap, M_VMPMAP);
	}
}

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap)
	register struct pmap *pmap;
{
	vm_put_pmap((struct pmaplist *) pmap->pm_pdir);
}

/*
 * Add a reference to the specified pmap.
*/
void
pmap_reference(pmap)
	pmap_t pmap;
{
	if (pmap != NULL) {
		/* bump the reference count under the pmap lock */
		simple_lock(&pmap->pm_lock);
		pmap->pm_count++;
		simple_unlock(&pmap->pm_lock);
	}
}

/* refill threshold: half a page worth of pv_entry structures */
#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2)

/*
 * Data for the pv entry allocation mechanism
 */
int pv_freelistcnt;		/* entries currently on pv_freelist */
pv_entry_t pv_freelist;		/* singly-linked free list of pv entries */
vm_offset_t pvva;		/* next reserved KVA page for pv entries */
int npvvapg;			/* reserved KVA pages remaining */

/*
 * free the pv_entry back to the free list
 */
inline static void
free_pv_entry(pv)
	pv_entry_t pv;
{
	if (!pv)
		return;
	++pv_freelistcnt;
	pv->pv_next = pv_freelist;
	pv_freelist = pv;
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 * the memory allocation is performed bypassing the malloc code
 * because of the possibility of allocations at interrupt time.
 */
static inline pv_entry_t
get_pv_entry()
{
	pv_entry_t tmp;

	/*
	 * get more pv_entry pages if needed
	 */
	while (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) {
		pmap_alloc_pv_entry();
	}

	/*
	 * get a pv_entry off of the free list
	 */
	--pv_freelistcnt;
	tmp = pv_freelist;
	pv_freelist = tmp->pv_next;
	/* hand back a cleared entry */
	tmp->pv_pmap = 0;
	tmp->pv_va = 0;
	tmp->pv_next = 0;
	return tmp;
}

/*
 * this *strange* allocation routine *statistically* eliminates the
 * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure.
 * also -- this code is MUCH MUCH faster than the malloc equiv...
 */
void
pmap_alloc_pv_entry()
{
	/*
	 * do we have any pre-allocated map-pages left?
*/ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ if (m = vm_page_alloc(kernel_object, pvva-vm_map_min(kernel_map))) { int newentries; int i; pv_entry_t entry; newentries = (NBPG/sizeof (struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ - pmap_enter(vm_map_pmap(kernel_map), pvva, - VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT,1); + pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ pvva += NBPG; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic * if this is too small.) */ npvvapg = ((npg*PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1)/NBPG; pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { pt_entry_t *ptp; vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { ptp=PTmap; /* otherwise, we are alternate address space */ } else { if ( frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; tlbflush(); } ptp=APTmap; } return ptp; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. 
*/
/*
 * pmap_remove_entry:
 *	Remove the pv entry matching (pmap, va) from the pv list headed
 *	at pv.  The head entry lives in pv_table itself, so removing the
 *	first element copies the second one up into the header.
 */
void
pmap_remove_entry(pmap, pv, va)
	struct pmap *pmap;
	pv_entry_t pv;
	vm_offset_t va;
{
	pv_entry_t npv;
	int wired;	/* XXX unused */
	int s;

	/* pv lists may be touched at interrupt time -- block interrupts */
	s = splhigh();
	if (pmap == pv->pv_pmap && va == pv->pv_va) {
		npv = pv->pv_next;
		if (npv) {
			*pv = *npv;
			free_pv_entry(npv);
		} else {
			pv->pv_pmap = NULL;
		}
	} else {
		for (npv = pv->pv_next; npv; npv = npv->pv_next) {
			if (pmap == npv->pv_pmap && va == npv->pv_va) {
				break;
			}
			pv = npv;
		}
		if (npv) {
			pv->pv_next = npv->pv_next;
			free_pv_entry(npv);
		}
	}
	splx(s);
}

/*
 * Remove the given range of addresses from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the page size.
 */
void
pmap_remove(pmap, sva, eva)
	struct pmap *pmap;
	register vm_offset_t sva;
	register vm_offset_t eva;
{
	register pt_entry_t *ptp,*ptq;
	vm_offset_t pa;
	register pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m;
	pt_entry_t oldpte;

	if (pmap == NULL)
		return;

	ptp = get_pt_entry(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if( (sva + NBPG) == eva) {

		if( *pmap_pde( pmap, sva) == 0)
			return;

		ptq = ptp + i386_btop(sva);

		if( !*ptq)
			return;
		/*
		 * Update statistics
		 */
		if (pmap_pte_w(ptq))
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		pa = pmap_pte_pa(ptq);
		oldpte = *ptq;
		*ptq = 0;

		if (pmap_is_managed(pa)) {
			/* reflect hardware M/U bits back to the vm_page */
			if ((int) oldpte & (PG_M | PG_U)) {
				/*
				 * NOTE(review): the second disjunct re-admits
				 * the user stack range excluded by the first;
				 * confirm the intended predicate.
				 */
				if ((sva < USRSTACK || sva > UPT_MAX_ADDRESS) ||
				    (sva >= USRSTACK && sva < USRSTACK+(UPAGES*NBPG))) {
					if (sva < clean_sva || sva >= clean_eva) {
						m = PHYS_TO_VM_PAGE(pa);
						if ((int) oldpte & PG_M) {
							m->flags &= ~PG_CLEAN;
						}
						if ((int) oldpte & PG_U) {
							m->flags |= PG_REFERENCED;
						}
					}
				}
			}

			pv = pa_to_pvh(pa);
			pmap_remove_entry(pmap, pv, sva);
			pmap_unuse_pt(pmap, sva);
		}
		tlbflush();
		return;
	}

	/* work in page-frame units from here on */
	sva = i386_btop(sva);
	eva = i386_btop(eva);

	while (sva < eva) {
		/*
		 * Weed out invalid mappings.
		 * Note: we assume that the page directory table is
		 * always allocated, and in kernel virtual.
		 */
		if ( *pmap_pde(pmap, i386_ptob(sva)) == 0 ) {
			/* We can race ahead here, straight to next pde.. */
	nextpde:
			sva = ((sva + NPTEPG) & ~(NPTEPG - 1));
			continue;
		}

		ptq = ptp + sva;

		/*
		 * search for page table entries, use string operations
		 * that are much faster than
		 * explicitly scanning when page tables are not fully
		 * populated.
		 */
		if ( *ptq == 0) {
			vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1));
			vm_offset_t nscan = pdnxt - sva;
			int found = 0;

			if ((nscan + sva) > eva)
				nscan = eva - sva;

			/* repe/scasl: scan forward for the first nonzero PTE */
			asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
				:"=D"(ptq),"=a"(found)
				:"c"(nscan),"0"(ptq)
				:"cx");

			if( !found) {
				sva = pdnxt;
				continue;
			}
			/* scasl stops one past the hit; back up */
			ptq -= 1;

			sva = ptq - ptp;
		}

		/*
		 * Update statistics
		 */
		oldpte = *ptq;
		if (((int)oldpte) & PG_W)
			pmap->pm_stats.wired_count--;
		pmap->pm_stats.resident_count--;

		/*
		 * Invalidate the PTEs.
		 * XXX: should cluster them up and invalidate as many
		 * as possible at once.
		 */
		*ptq = 0;

		va = i386_ptob(sva);

		/*
		 * Remove from the PV table (raise IPL since we
		 * may be called at interrupt time).
		 */
		pa = ((int)oldpte) & PG_FRAME;
		if (!pmap_is_managed(pa)) {
			++sva;
			continue;
		}

		if ((((int) oldpte & PG_M) && (va < USRSTACK || va > UPT_MAX_ADDRESS))
			|| (va >= USRSTACK && va < USRSTACK+(UPAGES*NBPG))) {
			if (va < clean_sva || va >= clean_eva ) {
				m = PHYS_TO_VM_PAGE(pa);
				m->flags &= ~PG_CLEAN;
			}
		}

		pv = pa_to_pvh(pa);
		pmap_remove_entry(pmap, pv, va);
		pmap_unuse_pt(pmap, va);
		++sva;
	}
	tlbflush();
}

/*
 * Routine:	pmap_remove_all
 * Function:
 *	Removes this physical page from
 *	all physical maps in which it resides.
 *	Reflects back modify bits to the pager.
 *
 * Notes:
 *	Original versions of this routine were very
 *	inefficient because they iteratively called
 *	pmap_remove (slow...)
*/ void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; struct map *map; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ if (!pmap_is_managed(pa)) return; pa = i386_trunc_page(pa); pv = pa_to_pvh(pa); m = PHYS_TO_VM_PAGE(pa); s = splhigh(); while (pv->pv_pmap != NULL) { pmap = pv->pv_pmap; ptp = get_pt_entry(pmap); va = i386_btop(pv->pv_va); pte = ptp + va; if (pmap_pte_w(pte)) pmap->pm_stats.wired_count--; if ( *pte) { pmap->pm_stats.resident_count--; anyvalid++; /* * update the vm_page_t clean bit */ if ( (m->flags & PG_CLEAN) && ((((int) *pte) & PG_M) && (pv->pv_va < USRSTACK || pv->pv_va > UPT_MAX_ADDRESS)) || (pv->pv_va >= USRSTACK && pv->pv_va < USRSTACK+(UPAGES*NBPG))) { if (pv->pv_va < clean_sva || pv->pv_va >= clean_eva) { m->flags &= ~PG_CLEAN; } } *pte = 0; } pmap_unuse_pt(pmap, pv->pv_va); npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } splx(s); if (anyvalid) tlbflush(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register pt_entry_t *pte; register vm_offset_t va; int i386prot; register pt_entry_t *ptp; int evap = i386_btop(eva); int s; int anyvalid = 0;; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; ptp = get_pt_entry(pmap); va = sva; while (va < eva) { int found=0; int svap; vm_offset_t nscan; /* * Page table page is not allocated. * Skip it, we don't want to force allocation * of unnecessary PTE pages just to set the protection. */ if (! 
*pmap_pde(pmap, va)) { /* XXX: avoid address wrap around */ nextpde: if (va >= i386_trunc_pdr((vm_offset_t)-1)) break; va = i386_round_pdr(va + PAGE_SIZE); continue; } pte = ptp + i386_btop(va); if( *pte == 0) { /* * scan for a non-empty pte */ svap = pte - ptp; nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap; if (nscan + svap > evap) nscan = evap - svap; found = 0; if (nscan) asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" :"=D"(pte),"=a"(found) :"c"(nscan),"0"(pte):"cx"); if( !found) goto nextpde; pte -= 1; svap = pte - ptp; va = i386_ptob(svap); } anyvalid++; i386prot = pte_prot(pmap, prot); if (va < UPT_MAX_ADDRESS) { i386prot |= PG_u; if( va >= UPT_MIN_ADDRESS) i386prot |= PG_RW; } pmap_pte_set_prot(pte, i386prot); va += PAGE_SIZE; } if (anyvalid) tlbflush(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int cacheable=1; int ptevalid = 0; if (pmap == NULL) return; va = i386_trunc_page(va); pa = i386_trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS)panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if ( *pmap_pde(pmap, va) == 0) { pg("ptdi %x, va %x", pmap->pm_pdir[PTDPTDI], va); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. 
* We don't worry about wiring PT pages as they remain * resident as long as there are valid mappings in them. * Hence, if a user page is wired, the PT page will be also. */ if (wired && !pmap_pte_w(pte) || !wired && pmap_pte_w(pte)) { if (wired) pmap->pm_stats.wired_count++; else pmap->pm_stats.wired_count--; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory * Note that we raise IPL while manipulating pv_table * since pmap_enter can be called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. * Place this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); cacheable = 1; } else { cacheable = 0; } pmap_use_pt(pmap, va); /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ( (int) (pa | pte_prot(pmap, prot) | PG_V)); /* * for correctness: */ if( !cacheable) (int) npte |= PG_N; /* * When forking (copy-on-write, etc): * A process will turn off write permissions for any of its writable * pages. If the data (object) is only referred to by one process, the * processes map is modified directly as opposed to using the * object manipulation routine. When using pmap_protect, the * modified bits are not kept in the vm_page_t data structure. 
* Therefore, when using pmap_enter in vm_fault to bring back * writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. * this clause supports the carryover of the modified and * used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M|PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if(*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) tlbflush(); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for(i=0;ipm_stats.wired_count++; - } - goto validate; - } - - if (opa) { - pmap_remove(kernel_pmap, va, va + PAGE_SIZE); - } - - pv = pa_to_pvh(pa); - s = splhigh(); - /* - * No entries yet, use header as the first entry - */ - if (pv->pv_pmap == NULL) { - pv->pv_va = va; - pv->pv_pmap = kernel_pmap; - pv->pv_next = NULL; - } - /* - * There is at least one other VA mapping this page. - * Place this entry after the header. - */ - else { - npv = get_pv_entry(); - npv->pv_va = va; - npv->pv_pmap = kernel_pmap; - npv->pv_next = pv->pv_next; - pv->pv_next = npv; - } - splx(s); - - /* - * Increment counters - */ - kernel_pmap->pm_stats.resident_count++; - -validate: - - /* - * Now validate mapping with desired protection/wiring. - */ - *pte = (pt_entry_t) ( (int) (pa | PG_RW | PG_V | PG_W)); + *pte = (pt_entry_t) 0; + tlbflush(); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. 
* but is *MUCH* faster than pmap_enter... */ static inline int pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; int anyvalid = 0; /* * Enter on the PV list if part of our managed memory * Note that we raise IPL while manipulating pv_table * since pmap_enter can be called at interrupt time. */ pte = vtopte(va); if (pmap_pte_pa(pte)) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. * Place this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); pmap_use_pt(pmap, va); /* * Increment counters */ pmap->pm_stats.resident_count++; validate: if (*pte) anyvalid++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ( (int) (pa | PG_V | PG_u)); return (anyvalid); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, offset, size) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_offset_t offset; vm_offset_t size; { vm_offset_t tmpoff; vm_page_t p; int s; vm_offset_t v, lastv=0; pt_entry_t pte; extern vm_map_t kernel_map; vm_offset_t objbytes; int anyvalid = 0; if (!pmap) return; /* * if we are processing a major portion of the object, then * scan the entire thing. 
*/ if( size > object->size / 2) { objbytes = size; p = object->memq.tqh_first; while ((p != NULL) && (objbytes != 0)) { tmpoff = p->offset; if( tmpoff < offset) { p = p->listq.tqe_next; continue; } tmpoff -= offset; if( tmpoff >= size) { p = p->listq.tqe_next; continue; } if ((p->flags & (PG_BUSY|PG_FICTITIOUS)) == 0 ) { vm_page_hold(p); v = i386_trunc_page(((vm_offset_t)vtopte( addr+tmpoff))); /* a fault might occur here */ *(volatile char *)v += 0; vm_page_unhold(p); anyvalid += pmap_enter_quick(pmap, addr+tmpoff, VM_PAGE_TO_PHYS(p)); } p = p->listq.tqe_next; objbytes -= NBPG; } } else { /* * else lookup the pages one-by-one. */ for(tmpoff = 0; tmpoff < size; tmpoff += NBPG) { if( p = vm_page_lookup(object, tmpoff + offset)) { if( (p->flags & (PG_BUSY|PG_FICTITIOUS)) == 0) { vm_page_hold(p); v = i386_trunc_page(((vm_offset_t)vtopte( addr+tmpoff))); /* a fault might occur here */ *(volatile char *)v += 0; vm_page_unhold(p); anyvalid += pmap_enter_quick(pmap, addr+tmpoff, VM_PAGE_TO_PHYS(p)); } } } } if (anyvalid) tlbflush(); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte) || !wired && pmap_pte_w(pte)) { if (wired) pmap->pm_stats.wired_count++; else pmap->pm_stats.wired_count--; } /* * Wiring is not a hardware characteristic so there is no need * to invalidate TLB. */ pmap_pte_set_w(pte, wired); /* * When unwiring, set the modified bit in the pte -- could have * been changed by the kernel */ if (!wired) (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Require that all active physical maps contain no * incorrect entries NOW. [This update includes * forcing updates of any address map caching.] * * Generally used to insure that a thread about * to run will see a semantically correct world. */ void pmap_update() { tlbflush(); } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *)CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *)CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); bzero(CADDR2,NBPG); *(int *)CMAP2 = 0; tlbflush(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *)CMAP1 || *(int *)CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *)CMAP1 = PG_V | PG_KW | i386_trunc_page(src); *(int *)CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); #if __GNUC__ > 1 memcpy(CADDR2, CADDR1, NBPG); #else bcopy(CADDR1, CADDR2, NBPG); #endif *(int *)CMAP1 = 0; *(int *)CMAP2 = 0; tlbflush(); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. 
*/ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning * immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return(FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning * immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, * then mark UPAGES as always modified, and * ptes as never modified. 
*/ if (bit & PG_U ) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } if (bit & PG_M ) { if (pv->pv_va >= USRSTACK) { if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { continue; } if (pv->pv_va < USRSTACK+(UPAGES*NBPG)) { splx(s); return TRUE; } else if (pv->pv_va < UPT_MAX_ADDRESS) { splx(s); return FALSE; } } } + if( !pv->pv_pmap) { + printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); + continue; + } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return(FALSE); } /* * this routine is used to modify bits in ptes */ static inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos * If setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } + if( !pv->pv_pmap) { + printf("Null pmap (cb) at va: 0x%lx\n", va); + continue; + } pte = pmap_pte(pv->pv_pmap, va); if (setem) (int) npte = (int) *pte | bit; else (int) npte = (int) *pte & ~bit; *pte = npte; } } splx(s); tlbflush(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(pa) vm_offset_t pa; { pmap_changebit(pa, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. 
*/ void pmap_clear_reference(pa) vm_offset_t pa; { pmap_changebit(pa, PG_U, FALSE); } /* * pmap_is_referenced: * * Return whether or not the specified physical page is referenced * by any physical maps. */ boolean_t pmap_is_referenced(pa) vm_offset_t pa; { return(pmap_testbit(pa, PG_U)); } /* * pmap_is_modified: * * Return whether or not the specified physical page is modified * by any physical maps. */ boolean_t pmap_is_modified(pa) vm_offset_t pa; { return(pmap_testbit(pa, PG_M)); } /* * Routine: pmap_copy_on_write * Function: * Remove write privileges from all * physical maps for this physical page. */ void pmap_copy_on_write(pa) vm_offset_t pa; { pmap_changebit(pa, PG_RW, FALSE); } vm_offset_t pmap_phys_address(ppn) int ppn; { return(i386_ptob(ppn)); } /* * Miscellaneous support routines follow */ void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute * bit, so just make it readable. 
*/ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } #ifdef DEBUG /* print address space of pmap*/ void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024 ; j++) { va = (i< UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *)ptep); } ; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 1886) +++ head/sys/i386/i386/machdep.c (revision 1887) @@ -1,1532 +1,1535 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.45 1994/08/03 02:45:26 davidg Exp $ + * $Id: machdep.c,v 1.46 1994/08/04 06:10:27 davidg Exp $ */ #include "npx.h" #include "isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include "sys/shm.h" #endif #ifdef SYSVMSG #include "msg.h" #endif #ifdef SYSVSEM #include "sem.h" #endif #include "vm/vm.h" #include "vm/vm_kern.h" #include "vm/vm_page.h" #include "sys/exec.h" #include "sys/vnode.h" extern vm_offset_t avail_start, avail_end; #include "machine/cpu.h" #include "machine/reg.h" #include "machine/psl.h" #include "machine/specialreg.h" #include "machine/sysarch.h" #include "machine/cons.h" #include "i386/isa/isa.h" #include "i386/isa/rtc.h" static void identifycpu(void); static void initcpu(void); static int test_page(int *, int); extern int grow(struct proc *,u_int); const char machine[] = "PC-Class"; const char *cpu_model; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif /* * Declare these as initialized data so we can patch them. 
*/ int nswbuf = 0; #ifdef NBUF int nbuf = NBUF; #else int nbuf = 0; #endif #ifdef BUFPAGES int bufpages = BUFPAGES; #else int bufpages = 0; #endif #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif int msgbufmapped = 0; /* set when safe to use msgbuf */ extern int freebufspace; extern char *bouncememory; int _udatasel, _ucodesel; /* * Machine-dependent startup code */ int boothowto = 0, Maxmem = 0, badpages = 0, physmem = 0; long dumplo; extern int bootdev; int biosmem; vm_offset_t phys_avail[6]; extern cyloffset; int cpu_class; void dumpsys __P((void)); vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; vm_offset_t pager_sva, pager_eva; int maxbkva, pager_map_size; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) void cpu_startup() { register int unixsize; register unsigned i; register struct pte *pte; int mapaddr, j; register caddr_t v; int maxbufs, base, residual; extern long Usrptsize; vm_offset_t minaddr, maxaddr; vm_size_t size = 0; int firstaddr; /* * Initialize error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); identifycpu(); printf("real memory = %d (%d pages)\n", ptoa(physmem), physmem); if (badpages) printf("bad memory = %d (%d pages)\n", ptoa(badpages), badpages); /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. 
The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! */ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif /* * Determine how many buffers to allocate. * Use 20% of memory of memory beyond the first 2MB * Insure a minimum of 16 fs buffers. * We allocate 1/2 as many swap buffer headers as file i/o buffers. 
*/ if (bufpages == 0) bufpages = ((physmem << PGSHIFT) - 2048*1024) / NBPG / 5; if (bufpages < 64) bufpages = 64; /* * We must still limit the maximum number of buffers to be no * more than 2/5's of the size of the kernal malloc region, this * will only take effect for machines with lots of memory */ bufpages = min(bufpages, (VM_KMEM_SIZE / NBPG) * 2 / 5); if (nbuf == 0) { nbuf = bufpages / 2; if (nbuf < 32) nbuf = 32; } freebufspace = bufpages * NBPG; if (nswbuf == 0) { nswbuf = (nbuf / 2) &~ 1; /* force even */ - if (nswbuf > 256) - nswbuf = 256; /* sanity */ + if (nswbuf > 64) + nswbuf = 64; /* sanity */ } valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifndef NOBOUNCE /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) bouncepages = 96; /* largest physio size + extra */ v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, - (nbuf*MAXBSIZE) + VM_PHYS_SIZE + maxbkva + pager_map_size, TRUE); - - io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); + (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + + maxbkva + pager_map_size, TRUE); + buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, + (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, - pager_map_size, TRUE); + (nswbuf*MAXPHYS) + pager_map_size, TRUE); + io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); - buffer_map 
= kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, - (nbuf * MAXBSIZE), TRUE); +#if 0 /* * Allocate a submap for physio */ phys_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, VM_PHYS_SIZE, TRUE); +#endif /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(NMBCLUSTERS+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, NMBCLUSTERS+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, VM_MBUF_SIZE, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; printf("avail memory = %d (%d pages)\n", ptoa(cnt.v_free_count), cnt.v_free_count); printf("using %d buffers containing %d bytes of memory\n", nbuf, bufpages * CLBYTES); #ifndef NOBOUNCE /* * init bounce buffers */ vm_bounce_init(); #endif /* * Set up CPU-specific registers, cache, etc. */ initcpu(); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); + vm_pager_bufferinit(); /* * Configure the system. */ configure(); } struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "i586", CPUCLASS_586 }, /* CPU_586 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { printf("%s", i386_cpus[cpu].cpu_name); cpu_class = i386_cpus[cpu].cpu_class; cpu_model = i386_cpus[cpu].cpu_name; } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } printf(" ("); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; case CPUCLASS_386: printf("386"); break; case CPUCLASS_486: printf("486"); break; case CPUCLASS_586: printf("586"); break; default: printf("unknown"); /* will panic below... 
*/ } printf("-class CPU)"); printf("\n"); /* cpu speed would be nice, but how? */ /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif panic("CPU class not configured"); default: break; } } #ifdef PGINPROF /* * Return the difference (in microseconds) * between the current time and a previous * time as represented by the arguments. * If there is a pending clock interrupt * which has not been serviced due to high * ipl, return error code. */ /*ARGSUSED*/ vmtime(otime, olbolt, oicr) register int otime, olbolt, oicr; { return (((time.tv_sec-otime)*60 + lbolt-olbolt)*16667); } #endif extern int kstack[]; /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigacts *psp = p->p_sigacts; int oonstack, frmtrap; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. 
*/ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_base + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. */ fp->sf_signum = sig; fp->sf_code = code; fp->sf_scp = &fp->sf_sc; fp->sf_addr = (char *) regs[tERR]; fp->sf_handler = catcher; /* save scratch registers */ fp->sf_sc.sc_eax = regs[tEAX]; fp->sf_sc.sc_ebx = regs[tEBX]; fp->sf_sc.sc_ecx = regs[tECX]; fp->sf_sc.sc_edx = regs[tEDX]; fp->sf_sc.sc_esi = regs[tESI]; fp->sf_sc.sc_edi = regs[tEDI]; fp->sf_sc.sc_cs = regs[tCS]; fp->sf_sc.sc_ds = regs[tDS]; fp->sf_sc.sc_ss = regs[tSS]; fp->sf_sc.sc_es = regs[tES]; fp->sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ fp->sf_sc.sc_onstack = oonstack; fp->sf_sc.sc_mask = mask; fp->sf_sc.sc_sp = regs[tESP]; fp->sf_sc.sc_fp = regs[tEBP]; fp->sf_sc.sc_pc = regs[tEIP]; fp->sf_sc.sc_ps = regs[tEFLAGS]; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). 
* Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ struct sigreturn_args { struct sigcontext *sigcntxp; }; int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); eflags = scp->sc_ps; if ((eflags & PSL_USERCLR) != 0 || (eflags & PSL_USERSET) != PSL_USERSET || (eflags & PSL_IOPL) < (regs[tEFLAGS] & PSL_IOPL)) { #ifdef DEBUG printf("sigreturn: eflags=0x%x\n", eflags); #endif return(EINVAL); } /* * Sanity check the user's selectors and error if they * are suspect. */ #define max_ldt_sel(pcb) \ ((pcb)->pcb_ldt ? 
(pcb)->pcb_ldt_len : (sizeof(ldt) / sizeof(ldt[0]))) #define valid_ldt_sel(sel) \ (ISLDT(sel) && ISPL(sel) == SEL_UPL && \ IDXSEL(sel) < max_ldt_sel(&p->p_addr->u_pcb)) #define null_sel(sel) \ (!ISLDT(sel) && IDXSEL(sel) == 0) if ((scp->sc_cs&0xffff != _ucodesel && !valid_ldt_sel(scp->sc_cs)) || (scp->sc_ss&0xffff != _udatasel && !valid_ldt_sel(scp->sc_ss)) || (scp->sc_ds&0xffff != _udatasel && !valid_ldt_sel(scp->sc_ds) && !null_sel(scp->sc_ds)) || (scp->sc_es&0xffff != _udatasel && !valid_ldt_sel(scp->sc_es) && !null_sel(scp->sc_es))) { #ifdef DEBUG printf("sigreturn: cs=0x%x ss=0x%x ds=0x%x es=0x%x\n", scp->sc_cs, scp->sc_ss, scp->sc_ds, scp->sc_es); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } #undef max_ldt_sel #undef valid_ldt_sel #undef null_sel /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } /* * a simple function to make the system panic (and dump a vmcore) * in a predictable fashion */ void diediedie() { panic("because you said to!"); } int waittime = -1; struct pcb dumppcb; void boot(arghowto) int arghowto; { register long dummy; /* r12 is reserved */ register int howto; /* r11 == how to boot */ register int devtype; /* r10 == major of root dev */ extern int cold; int nomsg = 1; if (cold) { printf("hit reset please"); for(;;); } howto = arghowto; if ((howto&RB_NOSYNC) == 0 && waittime < 0) 
	{
		register struct buf *bp;
		int iter, nbusy;

		waittime = 0;		/* mark sync in progress (re-entry guard) */
		(void) splnet();
		printf("syncing disks... ");
		/*
		 * Release inodes held by texts before update.
		 */
		if (panicstr == 0)
			vnode_pager_umount(NULL);
		sync(curproc, NULL, NULL);
		/*
		 * Unmount filesystems
		 */
#if 0
		if (panicstr == 0)
			vfs_unmountall();
#endif
		/*
		 * Poll up to 20 times, with increasing delay, for the
		 * buffer cache to drain before giving up.
		 */
		for (iter = 0; iter < 20; iter++) {
			nbusy = 0;
			for (bp = &buf[nbuf]; --bp >= buf; )
				if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
					nbusy++;
			if (nbusy == 0)
				break;
			if (nomsg) {
				printf("updating disks before rebooting... ");
				nomsg = 0;
			}
			printf("%d ", nbusy);
			DELAY(40000 * iter);
		}
		if (nbusy)
			printf("giving up\n");
		else
			printf("done\n");
		DELAY(10000);		/* wait for printf to finish */
	}
	splhigh();
	devtype = major(rootdev);
	if (howto&RB_HALT) {
		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cngetc();
	} else {
		if (howto & RB_DUMP) {
			/* save context and page directory for savecore */
			savectx(&dumppcb, 0);
			dumppcb.pcb_ptd = rcr3();
			dumpsys();

			if (PANIC_REBOOT_WAIT_TIME != 0) {
				if (PANIC_REBOOT_WAIT_TIME != -1) {
					int loop;
					printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME);
					for (loop = PANIC_REBOOT_WAIT_TIME;
					    loop > 0; --loop) {
						DELAY(1000 * 1000); /* one second */
						if (sgetc(1)) /* Did user type a key? */
							break;
					}
					/* countdown expired: reboot now */
					if (!loop)
						goto die;
				}
			} else { /* zero time specified - reboot NOW */
				goto die;
			}
			printf("--> Press a key on the console to reboot <--\n");
			cngetc();
		}
	}
#ifdef lint
	dummy = 0; dummy = dummy;
	printf("howto %d, devtype %d\n", arghowto, devtype);
#endif
die:
	printf("Rebooting...\n");
	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */
	cpu_reset();
	for(;;) ;
	/* NOTREACHED */
}

unsigned long	dumpmag = 0x8fca0101UL;	/* magic number for savecore */
int		dumpsize = 0;		/* also for savecore */

/*
 * Doadump comes here after turning off memory management and
 * getting on the dump stack, either when called above, or by
 * the auto-restart code.
*/ void dumpsys() { if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; dumpsize = Maxmem; printf("\ndumping to dev %x, offset %d\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)].d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } #ifdef HZ /* * If HZ is defined we use this code, otherwise the code in * /sys/i386/i386/microtime.s is used. The othercode only works * for HZ=100. */ microtime(tvp) register struct timeval *tvp; { int s = splhigh(); *tvp = time; tvp->tv_usec += tick; while (tvp->tv_usec > 1000000) { tvp->tv_sec++; tvp->tv_usec -= 1000000; } splx(s); } #endif /* HZ */ static void initcpu() { } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { p->p_md.md_regs[tEBP] = 0; /* bottom of the fp chain */ p->p_md.md_regs[tEIP] = entry; p->p_md.md_regs[tESP] = stack; p->p_md.md_regs[tSS] = _udatasel; p->p_md.md_regs[tDS] = _udatasel; p->p_md.md_regs[tES] = _udatasel; p->p_md.md_regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } /* * machine dependent system variables. 
 */
/*
 * Handle the CPU_* branch of the sysctl MIB; currently only
 * CPU_CONSDEV (console device) is implemented.
 */
int
cpu_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{

	/* all sysctl names at this level are terminal */
	if (namelen != 1)
		return (ENOTDIR);		/* overloaded */

	switch (name[0]) {
	case CPU_CONSDEV:
		return (sysctl_rdstruct(oldp, oldlenp, newp, &cn_tty->t_dev,
		    sizeof cn_tty->t_dev));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

union descriptor gdt[NGDT];
union descriptor ldt[NLDT];		/* local descriptor table */
struct gate_descriptor idt[NIDT];	/* interrupt descriptor table */

int _default_ldt, currentldt;

struct	i386tss	tss, panic_tss;

extern  struct user *proc0paddr;

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* length - all address space */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused -
				   default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - Placeholder */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Panic Tss Descriptor */
{	(int) &panic_tss,	/* segment base address  */
	sizeof(tss)-1,		/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Proc 0 Tss Descriptor */
{	(int) kstack,		/* segment base address  */
	sizeof(tss)-1,		/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),	/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
};

struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ }
};

/*
 * Install an interrupt gate for vector idx pointing at func, with the
 * given gate type and descriptor privilege level.  The selector is
 * hard-wired to the kernel code segment (8 == GSEL(GCODE_SEL, SEL_KPL)).
 */
void
setidt(idx, func, typ, dpl)
	int idx;
	void (*func)();
	int typ;
	int dpl;
{
	struct gate_descriptor *ip = idt + idx;

	ip->gd_looffset = (int)func;
	ip->gd_selector = 8;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)
typedef void idtvec_t();

/* exception/trap entry points defined in locore assembly */
extern idtvec_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(dble), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(rsvd0),
	IDTVEC(rsvd1), IDTVEC(rsvd2), IDTVEC(rsvd3), IDTVEC(rsvd4),
	IDTVEC(rsvd5), IDTVEC(rsvd6), IDTVEC(rsvd7), IDTVEC(rsvd8),
	IDTVEC(rsvd9), IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12),
	IDTVEC(rsvd13), IDTVEC(rsvd14), IDTVEC(syscall);

int _gsel_tss;

/*
 * Machine-dependent startup: build the GDT/LDT/IDT, size and test
 * physical memory, bootstrap the pmap module, and set up proc0's TSS
 * and the system-call gate.  "first" is the first free physical address
 * after the kernel, as determined by locore.
 */
void
init386(first)
	int first;
{
	extern ssdtosd(), lgdt(), lidt(), lldt(), etext;
	int x, *pi;
	unsigned biosbasemem, biosextmem;
	struct gate_descriptor *gdp;
	extern int sigcode,szsigcode;
	/* table descriptors - used to load tables by microp */
	struct region_descriptor r_gdt, r_idt;
	int	pagesinbase, pagesinext;
	int	target_page;
	extern struct pte *CMAP1;
	extern caddr_t CADDR1;

	proc0.p_addr = proc0paddr;

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit ();

	/*
	 * make gdt memory segments, the code segment goes up to end of the
	 * page with etext in it, the data segment goes to the end of
	 * the address space
	 */
	gdt_segs[GCODE_SEL].ssd_limit = i386_btop(i386_round_page(&etext)) - 1;
	gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1;
	for (x=0; x < NGDT; x++) ssdtosd(gdt_segs+x, gdt+x);
	/* make ldt memory segments */
	/*
	 * The data segment limit must not cover the user area because we
	 * don't want the user area to be writable in copyout() etc. (page
	 * level protection is lost in kernel mode on 386's).  Also, we
	 * don't want the user area to be writable directly (page level
	 * protection of the user area is not available on 486's with
	 * CR0_WP set, because there is no user-read/kernel-write mode).
	 *
	 * XXX - VM_MAXUSER_ADDRESS is an end address, not a max.  And it
	 * should be spelled ...MAX_USER...
	 */
#define VM_END_USER_RW_ADDRESS	VM_MAXUSER_ADDRESS
	/*
	 * The code segment limit has to cover the user area until we move
	 * the signal trampoline out of the user area.  This is safe because
	 * the code segment cannot be written to directly.
	 */
#define VM_END_USER_R_ADDRESS	(VM_END_USER_RW_ADDRESS + UPAGES * NBPG)
	ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1;
	ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1;
	/* Note. eventually want private ldts per process */
	for (x=0; x < 5; x++) ssdtosd(ldt_segs+x, ldt+x);

	/* exceptions */
	setidt(0, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL);
	setidt(1, &IDTVEC(dbg),  SDT_SYS386TGT, SEL_KPL);
	setidt(2, &IDTVEC(nmi),  SDT_SYS386TGT, SEL_KPL);
	/* breakpoint and overflow are usable from user mode (DPL 3) */
	setidt(3, &IDTVEC(bpt),  SDT_SYS386TGT, SEL_UPL);
	setidt(4, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL);
	setidt(5, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL);
	setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL);
	setidt(7, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL);
	setidt(8, &IDTVEC(dble),  SDT_SYS386TGT, SEL_KPL);
	setidt(9, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL);
	setidt(10, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL);
	setidt(11, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL);
	setidt(12, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL);
	setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL);
	setidt(14, &IDTVEC(page),  SDT_SYS386TGT, SEL_KPL);
	setidt(15, &IDTVEC(rsvd),  SDT_SYS386TGT, SEL_KPL);
	setidt(16, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL);
	setidt(17, &IDTVEC(rsvd0),  SDT_SYS386TGT, SEL_KPL);
	setidt(18, &IDTVEC(rsvd1),  SDT_SYS386TGT, SEL_KPL);
	setidt(19, &IDTVEC(rsvd2),  SDT_SYS386TGT, SEL_KPL);
	setidt(20, &IDTVEC(rsvd3),  SDT_SYS386TGT, SEL_KPL);
	setidt(21, &IDTVEC(rsvd4),  SDT_SYS386TGT, SEL_KPL);
	setidt(22, &IDTVEC(rsvd5),  SDT_SYS386TGT, SEL_KPL);
	setidt(23, &IDTVEC(rsvd6),  SDT_SYS386TGT, SEL_KPL);
	setidt(24, &IDTVEC(rsvd7),  SDT_SYS386TGT, SEL_KPL);
	setidt(25, &IDTVEC(rsvd8),  SDT_SYS386TGT, SEL_KPL);
	setidt(26, &IDTVEC(rsvd9),  SDT_SYS386TGT, SEL_KPL);
	setidt(27, &IDTVEC(rsvd10),  SDT_SYS386TGT, SEL_KPL);
	setidt(28, &IDTVEC(rsvd11),  SDT_SYS386TGT, SEL_KPL);
	setidt(29, &IDTVEC(rsvd12),  SDT_SYS386TGT, SEL_KPL);
	setidt(30, &IDTVEC(rsvd13),  SDT_SYS386TGT, SEL_KPL);
	setidt(31, &IDTVEC(rsvd14),  SDT_SYS386TGT, SEL_KPL);

#include	"isa.h"
#if	NISA >0
	isa_defaultirq();
#endif

	/* load the new descriptor tables into the processor */
	r_gdt.rd_limit = sizeof(gdt) - 1;
	r_gdt.rd_base =  (int) gdt;
	lgdt(&r_gdt);
	r_idt.rd_limit = sizeof(idt) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);
	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	currentldt = _default_ldt;

#include "ddb.h"
#if NDDB > 0
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	/* Use BIOS values stored in RTC CMOS RAM, since probing
	 * breaks certain 386 AT relics.
	 */
	biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8);
	biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8);

	/*
	 * If BIOS tells us that it has more than 640k in the basemem,
	 *	don't believe it - set it to 640k.
	 */
	if (biosbasemem > 640)
		biosbasemem = 640;

	/*
	 * Some 386 machines might give us a bogus number for extended
	 *	mem. If this happens, stop now.
	 */
#ifndef LARGEMEM
	if (biosextmem > 65536) {
		panic("extended memory beyond limit of 64MB");
		/* NOTREACHED */
	}
#endif

	pagesinbase = biosbasemem * 1024 / NBPG;
	pagesinext = biosextmem * 1024 / NBPG;

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 *	there's 16MB of memory - this really confuses people that
	 *	are trying to use bus mastering ISA controllers with the
	 *	"16MB limit"; they only have 16MB, but the remapping puts
	 *	them beyond the limit.
	 * XXX - this should be removed when bounce buffers are
	 *	implemented.
	 */
	/*
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 *	chop it to 15MB.
	 */
	if ((pagesinext > 3840) && (pagesinext < 4096))
		pagesinext = 3840;

	/*
	 * Maxmem isn't the "maximum memory", it's the highest page of
	 * of the physical address space. It should be "Maxphyspage".
	 */
	Maxmem = pagesinext + 0x100000/PAGE_SIZE;

#ifdef MAXMEM
	if (MAXMEM/4 < Maxmem)
		Maxmem = MAXMEM/4;
#endif

	/*
	 * Calculate number of physical pages, but account for Maxmem
	 *	limitation above.
	 */
	physmem = pagesinbase +
	    (min(pagesinext + 0x100000/PAGE_SIZE, Maxmem) - 0x100000/PAGE_SIZE);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap (first, 0);

	/*
	 * Do simple memory test over range of extended memory that BIOS
	 *	indicates exists. Adjust Maxmem to the highest page of
	 *	good memory.
	 */
	printf("Testing memory (%dMB)...", ptoa(Maxmem)/1024/1024);

	/*
	 * Walk pages from the top down, mapping each through CMAP1 and
	 * checking it with several bit patterns; any failure lowers
	 * Maxmem to exclude the bad page.
	 */
	for (target_page = Maxmem - 1; target_page >= atop(first); target_page--) {

		/*
		 * map page into kernel: valid, read/write, non-cacheable
		 */
		*(int *)CMAP1 = PG_V | PG_KW | PG_N | ptoa(target_page);
		tlbflush();

		/*
		 * Test for alternating 1's and 0's
		 */
		filli(0xaaaaaaaa, CADDR1, PAGE_SIZE/sizeof(int));
		if (test_page((int *)CADDR1, 0xaaaaaaaa)) {
			Maxmem = target_page;
			badpages++;
			continue;
		}
		/*
		 * Test for alternating 0's and 1's
		 */
		filli(0x55555555, CADDR1, PAGE_SIZE/sizeof(int));
		if (test_page((int *)CADDR1, 0x55555555)) {
			Maxmem = target_page;
			badpages++;
			continue;
		}
		/*
		 * Test for all 1's
		 */
		filli(0xffffffff, CADDR1, PAGE_SIZE/sizeof(int));
		if (test_page((int *)CADDR1, 0xffffffff)) {
			Maxmem = target_page;
			badpages++;
			continue;
		}
		/*
		 * Test zeroing of page
		 */
		bzero(CADDR1, PAGE_SIZE);
		if (test_page((int *)CADDR1, 0)) {
			/*
			 * test of page failed
			 */
			Maxmem = target_page;
			badpages++;
			continue;
		}
	}
	printf("done.\n");

	*(int *)CMAP1 = 0;
	tlbflush();

	/* reserve space for the message buffer at the top of good memory */
	avail_end = (Maxmem << PAGE_SHIFT)
		    - i386_round_page(sizeof(struct msgbuf));

	/*
	 * Initialize pointers to the two chunks of memory; for use
	 *	later in vm_page_startup.
	 */
	/* avail_start is initialized in pmap_bootstrap */
	x = 0;
	if (pagesinbase > 1) {
		phys_avail[x++] = NBPG;		/* skip first page of memory */
		phys_avail[x++] = pagesinbase * NBPG;	/* memory up to the ISA hole */
	}
	phys_avail[x++] = avail_start;	/* memory up to the end */
	phys_avail[x++] = avail_end;
	phys_avail[x++] = 0;		/* no more chunks */
	phys_avail[x++] = 0;

	/* now running on new page tables, configured,and u/iom is accessible */

	/* make a initial tss so microp can get interrupt stack on syscall!
	 */
	proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG;
	proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ;
	_gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	/* put the I/O bitmap offset past the TSS so no ports are granted */
	((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt =
	    (sizeof(tss))<<16;
	ltr(_gsel_tss);

	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(syscall);
	/* XXX the "++" has no effect on the stored low offset */
	gdp->gd_looffset = x++;
	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16;

	/* transfer to user mode */

	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode);
	proc0.p_addr->u_pcb.pcb_flags = 0;
	proc0.p_addr->u_pcb.pcb_ptd = IdlePTD;
}

/*
 * Verify that every int-sized word in the page at "address" holds
 * "pattern"; returns 1 on the first mismatch, 0 if the page is clean.
 */
int
test_page(address, pattern)
	int *address;
	int pattern;
{
	int *x;

	for (x = address; x < (int *)((char *)address + PAGE_SIZE); x++) {
		if (*x != pattern)
			return (1);
	}
	return(0);
}

/*
 * The registers are in the frame; the frame is in the user area of
 * the process in question; when the process is active, the registers
 * are in "the kernel stack"; when it's not, they're still there, but
 * things get flipped around.  So, since p->p_md.md_regs is the whole address
 * of the register set, take its offset from the kernel stack, and
 * index into the user block.  Don't you just *love* virtual memory?
 * (I'm starting to think seymour is right...)
 */
int
ptrace_set_pc (struct proc *p, unsigned int addr)
{
	/* locate the trapframe inside the (possibly swapped) user area */
	void *regs = (char*)p->p_addr +
		((char*) p->p_md.md_regs - (char*) kstack);

	((struct trapframe *)regs)->tf_eip = addr;
	return 0;
}

int
ptrace_single_step (struct proc *p)
{
	/* same user-area relocation as ptrace_set_pc above */
	void *regs = (char*)p->p_addr +
		((char*) p->p_md.md_regs - (char*) kstack);

	/* PSL_T: CPU trap flag -- fault after each instruction */
	((struct trapframe *)regs)->tf_eflags |= PSL_T;
	return 0;
}

/*
 * Copy the registers to user-space.
*/ int ptrace_getregs (struct proc *p, unsigned int *addr) { int error; struct reg regs = {0}; if (error = fill_regs (p, ®s)) return error; return copyout (®s, addr, sizeof (regs)); } int ptrace_setregs (struct proc *p, unsigned int *addr) { int error; struct reg regs = {0}; if (error = copyin (addr, ®s, sizeof(regs))) return error; return set_regs (p, ®s); } int fill_regs(struct proc *p, struct reg *regs) { int error; struct trapframe *tp; void *ptr = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); tp = ptr; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return 0; } int set_regs (struct proc *p, struct reg *regs) { int error; struct trapframe *tp; void *ptr = (char*)p->p_addr + ((char*) p->p_md.md_regs - (char*) kstack); tp = ptr; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return 0; } #include "ddb.h" #if NDDB <= 0 void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid #define dkpart(dev) (minor(dev) & 7) /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. 
*/ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } /* calculate cylinder for disksort to order transfers with */ bp->b_pblkno = bp->b_blkno + p->p_offset; bp->b_cylin = bp->b_pblkno / lp->d_secpercyl; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 1886) +++ head/sys/i386/i386/pmap.c (revision 1887) @@ -1,2026 +1,1991 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.26 1994/05/25 08:54:35 rgrimes Exp $ + * $Id: pmap.c,v 1.27 1994/08/03 02:45:28 davidg Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include #include #include #include #include #include #include #include #include #include /* * Allocate various and sundry SYSMAPs used in the days of old VM * and not yet converted. XXX. */ #define BSDVM_COMPAT 1 /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) int protection_codes[8]; struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t phys_avail[6]; /* 2 entries + 1 null */ vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_size_t mem_size; /* memory size in bytes */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss)*/ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int i386pagesperpage; /* PAGE_SIZE / I386_PAGE_SIZE */ boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ vm_offset_t vm_first_phys, vm_last_phys; static inline boolean_t pmap_testbit(); static inline void pmap_changebit(); static inline int pmap_is_managed(); static inline void *vm_get_pmap(); static inline void vm_put_pmap(); inline void pmap_use_pt(); inline void pmap_unuse_pt(); -inline pt_entry_t * const pmap_pte(); +inline pt_entry_t * pmap_pte(); static inline pv_entry_t get_pv_entry(); void pmap_alloc_pv_entry(); void pmap_clear_modify(); -void i386_protection_init(); +static void i386_protection_init(); + +void pmap_kenter __P((vm_offset_t, vm_offset_t)); +void pmap_kremove __P((vm_offset_t)); +void pmap_qenter __P((vm_offset_t, vm_page_t *, int)); +void pmap_qremove __P((vm_offset_t, int)); + extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; #if BSDVM_COMPAT #include "msgbuf.h" /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1, *CMAP2, *ptmmap; caddr_t CADDR1, CADDR2, ptvmmap; pt_entry_t *msgbufmap; struct msgbuf *msgbufp; #endif void init_pv_entries(int) ; /* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ inline pt_entry_t * const pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ( (pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if ( frame != ((int) APTDpde & PG_FRAME) ) { APTDpde = pmap->pm_pdir[PTDPTDI]; tlbflush(); } return((pt_entry_t *) avtopte(va)); } } return(0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
*/ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { pd_entry_t save; vm_offset_t pa; int s; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ( (pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME)) ) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if ( frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; tlbflush(); } pa = *(int *) avtopte(va); } pa = (pa & PG_FRAME) | (va & ~PG_FRAME); return pa; } return 0; } /* * determine if a page is managed (memory vs. device) */ static inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { pt = i386_trunc_page( pt); pt = (pt - UPT_MIN_ADDRESS) / NBPG; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; return PHYS_TO_VM_PAGE(pt); } /* * Wire a page table page */ inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if (va >= VM_MAX_ADDRESS || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold( pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if (va >= VM_MAX_ADDRESS || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_unhold( pmap_pte_vm_page(pmap, pt)); } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * Map the kernel's code and data, and allocate the system page table. 
 *
 *	On the I386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */

#define DMAPAGES 8
void
pmap_bootstrap(firstaddr, loadaddr)
	vm_offset_t firstaddr;
	vm_offset_t loadaddr;
{
#if BSDVM_COMPAT
	vm_offset_t va;
	pt_entry_t *pte;
#endif
	extern int IdlePTD;

	/* the first DMAPAGES pages past firstaddr are kept for ISA DMA below */
	avail_start = firstaddr + DMAPAGES*NBPG;

	virtual_avail = (vm_offset_t) KERNBASE + avail_start;
	virtual_end = VM_MAX_KERNEL_ADDRESS;
	i386pagesperpage = PAGE_SIZE / NBPG;

	/*
	 * Initialize protection array.
	 */
	i386_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't
	 * have to use pmap_create, which is unlikely to work
	 * correctly at this part of the boot sequence.
	 */
	kernel_pmap = &kernel_pmap_store;

	kernel_pmap->pm_pdir = (pd_entry_t *)(KERNBASE + IdlePTD);

	simple_lock_init(&kernel_pmap->pm_lock);
	kernel_pmap->pm_count = 1;

#if BSDVM_COMPAT
	/*
	 * Allocate all the submaps we need
	 */
	/* carve one fixed kernel VA + PTE slot per submap */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*NBPG); p = pte; pte += (n);

	va = virtual_avail;
	pte = pmap_pte(kernel_pmap, va);

	SYSMAP(caddr_t		,CMAP1		,CADDR1	   ,1		)
	SYSMAP(caddr_t		,CMAP2		,CADDR2	   ,1		)
	SYSMAP(caddr_t		,ptmmap		,ptvmmap   ,1		)
	SYSMAP(struct msgbuf *	,msgbufmap	,msgbufp   ,1		)
	virtual_avail = va;
#endif
	/*
	 * reserve special hunk of memory for use by bus dma as a bounce
	 * buffer (contiguous virtual *and* physical memory). for now,
	 * assume vm does not use memory beneath hole, and we know that
	 * the bootstrap uses top 32k of base memory. -wfj
	 */
	{
		extern vm_offset_t isaphysmem;
		isaphysmem = va;

		virtual_avail = pmap_map(va, firstaddr,
				firstaddr + DMAPAGES*NBPG, VM_PROT_ALL);
	}

	*(int *)CMAP1 = *(int *)CMAP2 = *(int *)PTD = 0;
	tlbflush();
}

/*
 *	Initialize the pmap module.
* Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr, addr2; vm_size_t npg, s; int rv; int i; extern int KPTphys; extern int IdlePTD; /* * Now that kernel map has been allocated, we can mark as * unavailable regions which we have mapped in locore. */ addr = atdevbase; (void) vm_map_find(kernel_map, NULL, (vm_offset_t) 0, &addr, (0x100000-0xa0000), FALSE); addr = (vm_offset_t) KERNBASE + IdlePTD; vm_object_reference(kernel_object); (void) vm_map_find(kernel_map, kernel_object, addr, &addr, (4 + NKPT) * NBPG, FALSE); /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2) ; npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(struct pv_entry) * npg); s = i386_round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_entry_t) addr; /* * init the pv free list */ init_pv_entries(npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return(virt); } /* * Create and return a physical map. * * If the size specified for the map * is zero, the map is an actual physical * map, and may be referenced by the * hardware. * * If the size specified is non-zero, * the map will be used in software only, and * is bounded by that size. 
* * [ just allocate a ptd and mark it uninitialize -- should we track * with a table which process has which ptd? -wfj ] */ pmap_t pmap_create(size) vm_size_t size; { register pmap_t pmap; /* * Software use map does not need a pmap */ if (size) return(NULL); pmap = (pmap_t) malloc(sizeof *pmap, M_VMPMAP, M_WAITOK); bzero(pmap, sizeof(*pmap)); pmap_pinit(pmap); return (pmap); } struct pmaplist { struct pmaplist *next; }; static inline void * vm_get_pmap() { struct pmaplist *rtval; rtval = (struct pmaplist *)kmem_alloc(kernel_map, ctob(1)); bzero(rtval, ctob(1)); return rtval; } static inline void vm_put_pmap(up) struct pmaplist *up; { kmem_free(kernel_map, (vm_offset_t)up, ctob(1)); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { /* * No need to allocate page table space yet but we do need a * valid page directory table. */ pmap->pm_pdir = (pd_entry_t *) vm_get_pmap(); /* wire in kernel global address entries */ bcopy(PTD+KPTDI, pmap->pm_pdir+KPTDI, NKPT*PTESIZE); /* install self-referential address mapping entry */ *(int *)(pmap->pm_pdir+PTDPTDI) = ((int)pmap_kextract((vm_offset_t)pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; simple_lock_init(&pmap->pm_lock); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; simple_lock(&pmap->pm_lock); count = --pmap->pm_count; simple_unlock(&pmap->pm_lock); if (count == 0) { pmap_release(pmap); free((caddr_t)pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { vm_put_pmap((struct pmaplist *) pmap->pm_pdir); } /* * Add a reference to the specified pmap. 
*/ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { simple_lock(&pmap->pm_lock); pmap->pm_count++; simple_unlock(&pmap->pm_lock); } } #define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ int pv_freelistcnt; pv_entry_t pv_freelist; vm_offset_t pvva; int npvvapg; /* * free the pv_entry back to the free list */ inline static void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ while (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; tmp->pv_pmap = 0; tmp->pv_va = 0; tmp->pv_next = 0; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? 
*/ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ if (m = vm_page_alloc(kernel_object, pvva-vm_map_min(kernel_map))) { int newentries; int i; pv_entry_t entry; newentries = (NBPG/sizeof (struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ - pmap_enter(vm_map_pmap(kernel_map), pvva, - VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT,1); + pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ pvva += NBPG; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic * if this is too small.) */ npvvapg = ((npg*PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1)/NBPG; pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { pt_entry_t *ptp; vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { ptp=PTmap; /* otherwise, we are alternate address space */ } else { if ( frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; tlbflush(); } ptp=APTmap; } return ptp; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. 
*/ void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int wired; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { for (npv = pv->pv_next; npv; npv = npv->pv_next) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp,*ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; vm_page_t m; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if( (sva + NBPG) == eva) { if( *pmap_pde( pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if( !*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & (PG_M | PG_U)) { if ((sva < USRSTACK || sva > UPT_MAX_ADDRESS) || (sva >= USRSTACK && sva < USRSTACK+(UPAGES*NBPG))) { if (sva < clean_sva || sva >= clean_eva) { m = PHYS_TO_VM_PAGE(pa); if ((int) oldpte & PG_M) { m->flags &= ~PG_CLEAN; } if ((int) oldpte & PG_U) { m->flags |= PG_REFERENCED; } } } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); pmap_unuse_pt(pmap, sva); } tlbflush(); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. * Note: we assume that the page directory table is * always allocated, and in kernel virtual. 
*/ if ( *pmap_pde(pmap, i386_ptob(sva)) == 0 ) { /* We can race ahead here, straight to next pde.. */ nextpde: sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations * that are much faster than * explicitly scanning when page tables are not fully * populated. */ if ( *ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" :"=D"(ptq),"=a"(found) :"c"(nscan),"0"(ptq) :"cx"); if( !found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int)oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. * XXX: should cluster them up and invalidate as many * as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we * may be called at interrupt time). */ pa = ((int)oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { ++sva; continue; } if ((((int) oldpte & PG_M) && (va < USRSTACK || va > UPT_MAX_ADDRESS)) || (va >= USRSTACK && va < USRSTACK+(UPAGES*NBPG))) { if (va < clean_sva || va >= clean_eva ) { m = PHYS_TO_VM_PAGE(pa); m->flags &= ~PG_CLEAN; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } tlbflush(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) 
*/ void pmap_remove_all(pa) vm_offset_t pa; { register pv_entry_t pv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; struct map *map; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ if (!pmap_is_managed(pa)) return; pa = i386_trunc_page(pa); pv = pa_to_pvh(pa); m = PHYS_TO_VM_PAGE(pa); s = splhigh(); while (pv->pv_pmap != NULL) { pmap = pv->pv_pmap; ptp = get_pt_entry(pmap); va = i386_btop(pv->pv_va); pte = ptp + va; if (pmap_pte_w(pte)) pmap->pm_stats.wired_count--; if ( *pte) { pmap->pm_stats.resident_count--; anyvalid++; /* * update the vm_page_t clean bit */ if ( (m->flags & PG_CLEAN) && ((((int) *pte) & PG_M) && (pv->pv_va < USRSTACK || pv->pv_va > UPT_MAX_ADDRESS)) || (pv->pv_va >= USRSTACK && pv->pv_va < USRSTACK+(UPAGES*NBPG))) { if (pv->pv_va < clean_sva || pv->pv_va >= clean_eva) { m->flags &= ~PG_CLEAN; } } *pte = 0; } pmap_unuse_pt(pmap, pv->pv_va); npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } splx(s); if (anyvalid) tlbflush(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register pt_entry_t *pte; register vm_offset_t va; int i386prot; register pt_entry_t *ptp; int evap = i386_btop(eva); int s; int anyvalid = 0;; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; ptp = get_pt_entry(pmap); va = sva; while (va < eva) { int found=0; int svap; vm_offset_t nscan; /* * Page table page is not allocated. * Skip it, we don't want to force allocation * of unnecessary PTE pages just to set the protection. */ if (! 
*pmap_pde(pmap, va)) { /* XXX: avoid address wrap around */ nextpde: if (va >= i386_trunc_pdr((vm_offset_t)-1)) break; va = i386_round_pdr(va + PAGE_SIZE); continue; } pte = ptp + i386_btop(va); if( *pte == 0) { /* * scan for a non-empty pte */ svap = pte - ptp; nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap; if (nscan + svap > evap) nscan = evap - svap; found = 0; if (nscan) asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" :"=D"(pte),"=a"(found) :"c"(nscan),"0"(pte):"cx"); if( !found) goto nextpde; pte -= 1; svap = pte - ptp; va = i386_ptob(svap); } anyvalid++; i386prot = pte_prot(pmap, prot); if (va < UPT_MAX_ADDRESS) { i386prot |= PG_u; if( va >= UPT_MIN_ADDRESS) i386prot |= PG_RW; } pmap_pte_set_prot(pte, i386prot); va += PAGE_SIZE; } if (anyvalid) tlbflush(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int cacheable=1; int ptevalid = 0; if (pmap == NULL) return; va = i386_trunc_page(va); pa = i386_trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS)panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if ( *pmap_pde(pmap, va) == 0) { pg("ptdi %x, va %x", pmap->pm_pdir[PTDPTDI], va); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. 
* We don't worry about wiring PT pages as they remain * resident as long as there are valid mappings in them. * Hence, if a user page is wired, the PT page will be also. */ if (wired && !pmap_pte_w(pte) || !wired && pmap_pte_w(pte)) { if (wired) pmap->pm_stats.wired_count++; else pmap->pm_stats.wired_count--; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory * Note that we raise IPL while manipulating pv_table * since pmap_enter can be called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. * Place this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); cacheable = 1; } else { cacheable = 0; } pmap_use_pt(pmap, va); /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ( (int) (pa | pte_prot(pmap, prot) | PG_V)); /* * for correctness: */ if( !cacheable) (int) npte |= PG_N; /* * When forking (copy-on-write, etc): * A process will turn off write permissions for any of its writable * pages. If the data (object) is only referred to by one process, the * processes map is modified directly as opposed to using the * object manipulation routine. When using pmap_protect, the * modified bits are not kept in the vm_page_t data structure. 
* Therefore, when using pmap_enter in vm_fault to bring back * writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. * this clause supports the carryover of the modified and * used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M|PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if(*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) tlbflush(); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. */ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for(i=0;ipm_stats.wired_count++; - } - goto validate; - } - - if (opa) { - pmap_remove(kernel_pmap, va, va + PAGE_SIZE); - } - - pv = pa_to_pvh(pa); - s = splhigh(); - /* - * No entries yet, use header as the first entry - */ - if (pv->pv_pmap == NULL) { - pv->pv_va = va; - pv->pv_pmap = kernel_pmap; - pv->pv_next = NULL; - } - /* - * There is at least one other VA mapping this page. - * Place this entry after the header. - */ - else { - npv = get_pv_entry(); - npv->pv_va = va; - npv->pv_pmap = kernel_pmap; - npv->pv_next = pv->pv_next; - pv->pv_next = npv; - } - splx(s); - - /* - * Increment counters - */ - kernel_pmap->pm_stats.resident_count++; - -validate: - - /* - * Now validate mapping with desired protection/wiring. - */ - *pte = (pt_entry_t) ( (int) (pa | PG_RW | PG_V | PG_W)); + *pte = (pt_entry_t) 0; + tlbflush(); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. 
* but is *MUCH* faster than pmap_enter... */ static inline int pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; int anyvalid = 0; /* * Enter on the PV list if part of our managed memory * Note that we raise IPL while manipulating pv_table * since pmap_enter can be called at interrupt time. */ pte = vtopte(va); if (pmap_pte_pa(pte)) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. * Place this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); pmap_use_pt(pmap, va); /* * Increment counters */ pmap->pm_stats.resident_count++; validate: if (*pte) anyvalid++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ( (int) (pa | PG_V | PG_u)); return (anyvalid); } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap, addr, object, offset, size) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_offset_t offset; vm_offset_t size; { vm_offset_t tmpoff; vm_page_t p; int s; vm_offset_t v, lastv=0; pt_entry_t pte; extern vm_map_t kernel_map; vm_offset_t objbytes; int anyvalid = 0; if (!pmap) return; /* * if we are processing a major portion of the object, then * scan the entire thing. 
*/ if( size > object->size / 2) { objbytes = size; p = object->memq.tqh_first; while ((p != NULL) && (objbytes != 0)) { tmpoff = p->offset; if( tmpoff < offset) { p = p->listq.tqe_next; continue; } tmpoff -= offset; if( tmpoff >= size) { p = p->listq.tqe_next; continue; } if ((p->flags & (PG_BUSY|PG_FICTITIOUS)) == 0 ) { vm_page_hold(p); v = i386_trunc_page(((vm_offset_t)vtopte( addr+tmpoff))); /* a fault might occur here */ *(volatile char *)v += 0; vm_page_unhold(p); anyvalid += pmap_enter_quick(pmap, addr+tmpoff, VM_PAGE_TO_PHYS(p)); } p = p->listq.tqe_next; objbytes -= NBPG; } } else { /* * else lookup the pages one-by-one. */ for(tmpoff = 0; tmpoff < size; tmpoff += NBPG) { if( p = vm_page_lookup(object, tmpoff + offset)) { if( (p->flags & (PG_BUSY|PG_FICTITIOUS)) == 0) { vm_page_hold(p); v = i386_trunc_page(((vm_offset_t)vtopte( addr+tmpoff))); /* a fault might occur here */ *(volatile char *)v += 0; vm_page_unhold(p); anyvalid += pmap_enter_quick(pmap, addr+tmpoff, VM_PAGE_TO_PHYS(p)); } } } } if (anyvalid) tlbflush(); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte) || !wired && pmap_pte_w(pte)) { if (wired) pmap->pm_stats.wired_count++; else pmap->pm_stats.wired_count--; } /* * Wiring is not a hardware characteristic so there is no need * to invalidate TLB. */ pmap_pte_set_w(pte, wired); /* * When unwiring, set the modified bit in the pte -- could have * been changed by the kernel */ if (!wired) (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
*/ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Require that all active physical maps contain no * incorrect entries NOW. [This update includes * forcing updates of any address map caching.] * * Generally used to insure that a thread about * to run will see a semantically correct world. */ void pmap_update() { tlbflush(); } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. */ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *)CMAP2) panic("pmap_zero_page: CMAP busy"); *(int *)CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); bzero(CADDR2,NBPG); *(int *)CMAP2 = 0; tlbflush(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *)CMAP1 || *(int *)CMAP2) panic("pmap_copy_page: CMAP busy"); *(int *)CMAP1 = PG_V | PG_KW | i386_trunc_page(src); *(int *)CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); #if __GNUC__ > 1 memcpy(CADDR2, CADDR1, NBPG); #else bcopy(CADDR1, CADDR2, NBPG); #endif *(int *)CMAP1 = 0; *(int *)CMAP2 = 0; tlbflush(); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. 
*/ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning * immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return(FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning * immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, * then mark UPAGES as always modified, and * ptes as never modified. 
*/ if (bit & PG_U ) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } if (bit & PG_M ) { if (pv->pv_va >= USRSTACK) { if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { continue; } if (pv->pv_va < USRSTACK+(UPAGES*NBPG)) { splx(s); return TRUE; } else if (pv->pv_va < UPT_MAX_ADDRESS) { splx(s); return FALSE; } } } + if( !pv->pv_pmap) { + printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); + continue; + } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return(FALSE); } /* * this routine is used to modify bits in ptes */ static inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos * If setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } + if( !pv->pv_pmap) { + printf("Null pmap (cb) at va: 0x%lx\n", va); + continue; + } pte = pmap_pte(pv->pv_pmap, va); if (setem) (int) npte = (int) *pte | bit; else (int) npte = (int) *pte & ~bit; *pte = npte; } } splx(s); tlbflush(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(pa) vm_offset_t pa; { pmap_changebit(pa, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. 
*/ void pmap_clear_reference(pa) vm_offset_t pa; { pmap_changebit(pa, PG_U, FALSE); } /* * pmap_is_referenced: * * Return whether or not the specified physical page is referenced * by any physical maps. */ boolean_t pmap_is_referenced(pa) vm_offset_t pa; { return(pmap_testbit(pa, PG_U)); } /* * pmap_is_modified: * * Return whether or not the specified physical page is modified * by any physical maps. */ boolean_t pmap_is_modified(pa) vm_offset_t pa; { return(pmap_testbit(pa, PG_M)); } /* * Routine: pmap_copy_on_write * Function: * Remove write privileges from all * physical maps for this physical page. */ void pmap_copy_on_write(pa) vm_offset_t pa; { pmap_changebit(pa, PG_RW, FALSE); } vm_offset_t pmap_phys_address(ppn) int ppn; { return(i386_ptob(ppn)); } /* * Miscellaneous support routines follow */ void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute * bit, so just make it readable. 
*/ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } #ifdef DEBUG /* print address space of pmap*/ void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024 ; j++) { va = (i< UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *)ptep); } ; } void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/kern/kern_physio.c =================================================================== --- head/sys/kern/kern_physio.c (revision 1886) +++ head/sys/kern/kern_physio.c (revision 1887) @@ -1,173 +1,183 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * - * $Id$ + * $Id: kern_physio.c,v 1.3 1994/08/02 07:42:05 davidg Exp $ */ #include #include #include #include #include #include static void physwakeup(); int physio(strategy, bp, dev, rw, minp, uio) int (*strategy)(); struct buf *bp; dev_t dev; int rw; u_int (*minp)(); struct uio *uio; { int i; - int bp_alloc = (bp == 0); int bufflags = rw?B_READ:0; int error; int spl; + caddr_t sa; + int bp_alloc = (bp == 0); + struct buf *bpa; /* * keep the process from being swapped */ curproc->p_flag |= P_PHYSIO; /* create and build a buffer header for a transfer */ - - if (bp_alloc) { - bp = (struct buf *)getpbuf(); - } else { + bpa = (struct buf *)getpbuf(); + if (!bp_alloc) { spl = splbio(); while (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; tsleep((caddr_t)bp, PRIBIO, "physbw", 0); } bp->b_flags |= B_BUSY; splx(spl); + } else { + bp = bpa; } + /* + * get a copy of the kva from the physical buffer + */ + sa = bpa->b_data; bp->b_proc = curproc; bp->b_dev = dev; error = bp->b_error = 0; for(i=0;iuio_iovcnt;i++) { while( uio->uio_iov[i].iov_len) { vm_offset_t v, lastv, pa; caddr_t adr; bp->b_bcount = uio->uio_iov[i].iov_len; bp->b_bufsize = bp->b_bcount; bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags; bp->b_iodone = physwakeup; bp->b_data = uio->uio_iov[i].iov_base; + /* + * pass in the kva from the physical buffer + * for the temporary kernel mapping. 
+ */ + bp->b_saveaddr = sa; bp->b_blkno = btodb(uio->uio_offset); if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { error = EFAULT; goto doerror; } if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { error = EFAULT; goto doerror; } vmapbuf(bp); /* perform transfer */ (*strategy)(bp); spl = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep((caddr_t)bp, PRIBIO, "physstr", 0); splx(spl); vunmapbuf(bp); /* * update the uio data */ { int iolen = bp->b_bcount - bp->b_resid; uio->uio_iov[i].iov_len -= iolen; uio->uio_iov[i].iov_base += iolen; uio->uio_resid -= iolen; uio->uio_offset += iolen; } /* * check for an error */ if( bp->b_flags & B_ERROR) { error = bp->b_error; goto doerror; } } } doerror: - if (bp_alloc) { - relpbuf(bp); - } else { + relpbuf(bpa); + if (!bp_alloc) { bp->b_flags &= ~(B_BUSY|B_PHYS); if( bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup((caddr_t)bp); } } /* * allow the process to be swapped */ curproc->p_flag &= ~P_PHYSIO; return (error); } u_int minphys(struct buf *bp) { if( bp->b_bcount > MAXBSIZE) { bp->b_bcount = MAXBSIZE; } return bp->b_bcount; } int rawread(dev_t dev, struct uio *uio) { return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, dev, 1, minphys, uio)); } int rawwrite(dev_t dev, struct uio *uio) { return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, dev, 0, minphys, uio)); } static void physwakeup(bp) struct buf *bp; { wakeup((caddr_t) bp); bp->b_flags &= ~B_CALL; } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 1886) +++ head/sys/kern/vfs_bio.c (revision 1887) @@ -1,716 +1,717 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: vfs_bio.c,v 1.4 1994/08/02 07:43:13 davidg Exp $ + * $Id: vfs_bio.c,v 1.5 1994/08/04 19:43:13 davidg Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include struct buf *buf; /* buffer header pool */ int nbuf; /* number of buffer headers calculated elsewhere */ extern vm_map_t buffer_map, io_map; void vm_hold_free_pages(vm_offset_t from, vm_offset_t to); void vm_hold_load_pages(vm_offset_t from, vm_offset_t to); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * Initialize buffer headers and related structures. 
*/ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for(i=0;ib_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf *bp) { int s = splbio(); if( bp->b_qindex != QUEUE_NONE) { TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { panic("bremfree: removing a buffer when not on a queue"); } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. */ int bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred, struct buf **bpp) { struct buf *bp; bp = getblk (vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) /* count block I/O */ curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL); if( bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } VOP_STRATEGY(bp); return( biowait (bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. 
*/ int breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, struct buf **bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk (vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) /* count block I/O */ curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL); if( bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } VOP_STRATEGY(bp); ++readwait; } for(i=0;ib_flags & B_CACHE) == 0) { if (curproc && curproc->p_stats) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL); if( rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } VOP_STRATEGY(rabp); } else { brelse(rabp); } } if( readwait) { rv = biowait (bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async.) */ int bwrite(struct buf *bp) { int oldflags = bp->b_flags; if(bp->b_flags & B_INVAL) { brelse(bp); return (0); } if(!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI); bp->b_flags |= B_WRITEINPROG; if (oldflags & B_ASYNC) { if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } else if( curproc) { ++curproc->p_stats->p_ru.ru_oublock; } } bp->b_vp->v_numoutput++; VOP_STRATEGY(bp); if( (oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } else if( curproc) { ++curproc->p_stats->p_ru.ru_oublock; } brelse(bp); return (rtval); } return(0); } int vn_bwrite(ap) struct vop_bwrite_args *ap; { return (bwrite(ap->a_bp)); } /* * Delayed write. (Buffer is marked dirty). 
*/ void bdwrite(struct buf *bp) { if((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } if(bp->b_flags & B_INVAL) { brelse(bp); return; } if(bp->b_flags & B_TAPE) { bawrite(bp); return; } bp->b_flags &= ~B_READ; if( (bp->b_flags & B_DELWRI) == 0) { if( curproc) ++curproc->p_stats->p_ru.ru_oublock; bp->b_flags |= B_DONE|B_DELWRI; reassignbuf(bp, bp->b_vp); } brelse(bp); return; } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * Release a buffer. */ void brelse(struct buf *bp) { int x; /* anyone need a "free" block? */ x=splbio(); if (needsbuffer) { needsbuffer = 0; wakeup((caddr_t)&needsbuffer); } - /* anyone need this very block? */ + + /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED|B_AGE); wakeup((caddr_t)bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; bp->b_flags &= ~(B_DELWRI|B_CACHE); if(bp->b_vp) brelvp(bp); } if( bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* enqueue */ - /* buffers with junk contents */ + /* buffers with no memory */ if(bp->b_bufsize == 0) { bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; + /* buffers with junk contents */ } else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if(bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } 
else if(bp->b_flags & B_AGE) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } /* unlock */ bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE); splx(x); } int freebufspace; int allocbufspace; /* * Find a buffer header which is available for use. */ struct buf * getnewbuf(int slpflag, int slptimeo) { struct buf *bp; - int x; - x = splbio(); + int s; + s = splbio(); start: /* can we constitute a new buffer? */ if (bp = bufqueues[QUEUE_EMPTY].tqh_first) { if( bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue"); bremfree(bp); goto fillbuf; } tryfree: if (bp = bufqueues[QUEUE_AGE].tqh_first) { if( bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue"); bremfree(bp); } else if (bp = bufqueues[QUEUE_LRU].tqh_first) { if( bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue"); bremfree(bp); } else { /* wait for a free buffer of any kind */ needsbuffer = 1; tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0); - splx(x); + splx(s); return (0); } /* if we are a delayed write, convert to an async write */ if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_BUSY; bawrite (bp); goto start; } if(bp->b_vp) brelvp(bp); /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); fillbuf: bp->b_flags = B_BUSY; LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); - splx(x); + splx(s); bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_wcred = bp->b_rcred = NOCRED; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; return (bp); } /* * Check to see if a block is currently memory resident. 
*/ struct buf * incore(struct vnode *vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; int s = splbio(); bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp) { if( (bp < buf) || (bp >= buf + nbuf)) { printf("incore: buf out of range: %lx, hash: %d\n", bp, bh - bufhashtbl); panic("incore: buf fault"); } /* hit */ if (bp->b_lblkno == blkno && bp->b_vp == vp && (bp->b_flags & B_INVAL) == 0) { splx(s); return (bp); } bp = bp->b_hash.le_next; } splx(s); return(0); } /* * Get a block given a specified block and offset into a file/device. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; - int x; + int s; struct bufhashhdr *bh; - x = splbio(); + s = splbio(); loop: if (bp = incore(vp, blkno)) { if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; tsleep ((caddr_t)bp, PRIBIO, "getblk", 0); goto loop; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistancies */ if (bp->b_bcount != size) { printf("getblk: invalid buffer size: %d\n", bp->b_bcount); bp->b_flags |= B_INVAL; bwrite(bp); goto loop; } } else { if ((bp = getnewbuf(0, 0)) == 0) goto loop; allocbuf(bp, size); /* * have to check again, because of a possible * race condition. */ if (incore( vp, blkno)) { allocbuf(bp, 0); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } bp->b_blkno = bp->b_lblkno = blkno; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); } - splx(x); + splx(s); return (bp); } /* * Get an empty, disassociated buffer of given size. */ struct buf * geteblk(int size) { struct buf *bp; while ((bp = getnewbuf(0, 0)) == 0) ; allocbuf(bp, size); bp->b_flags |= B_INVAL; return (bp); } /* * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). 
*/ void allocbuf(struct buf *bp, int size) { int newbsize = round_page(size); if( newbsize == bp->b_bufsize) { bp->b_bcount = size; return; } else if( newbsize < bp->b_bufsize) { vm_hold_free_pages( (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if( newbsize > bp->b_bufsize) { vm_hold_load_pages( (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); } /* adjust buffer cache's idea of memory allocated to buffer contents */ freebufspace -= newbsize - bp->b_bufsize; allocbufspace += newbsize - bp->b_bufsize; bp->b_bufsize = newbsize; bp->b_bcount = size; } /* * Wait for buffer I/O completion, returning error status. */ int biowait(register struct buf *bp) { - int x; + int s; - x = splbio(); + s = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep((caddr_t)bp, PRIBIO, "biowait", 0); if((bp->b_flags & B_ERROR) || bp->b_error) { if ((bp->b_flags & B_INVAL) == 0) { bp->b_flags |= B_INVAL; bp->b_dev = NODEV; LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } if (!bp->b_error) bp->b_error = EIO; else bp->b_flags |= B_ERROR; - splx(x); + splx(s); return (bp->b_error); } else { - splx(x); + splx(s); return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. */ void biodone(register struct buf *bp) { int s; s = splbio(); bp->b_flags |= B_DONE; if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } + if (bp->b_flags & B_BOUNCE) + vm_bounce_free(bp); + /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone)(bp); splx(s); return; } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - * so no need to do a wakeup here in the async case. 
*/ if (bp->b_flags & B_ASYNC) { brelse(bp); } else { bp->b_flags &= ~B_WANTED; wakeup((caddr_t) bp); } splx(s); } int count_lock_queue() { int count; struct buf *bp; count = 0; for(bp = bufqueues[QUEUE_LOCKED].tqh_first; bp != NULL; bp = bp->b_freelist.tqe_next) count++; return(count); } #ifndef UPDATE_INTERVAL int vfs_update_interval = 30; #else int vfs_update_interval = UPDATE_INTERVAL; #endif void vfs_update() { (void) spl0(); while(1) { tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL, NULL); } } /* * these routines are not in the correct place (yet) * also they work *ONLY* for kernel_pmap!!! */ void vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) { vm_offset_t pg; vm_page_t p; vm_offset_t from = round_page(froma); vm_offset_t to = round_page(toa); for(pg = from ; pg < to ; pg += PAGE_SIZE) { vm_offset_t pa; tryagain: if (cnt.v_free_count <= cnt.v_free_reserved) { VM_WAIT; goto tryagain; } + p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS); if( !p) { VM_WAIT; goto tryagain; } vm_page_wire(p); - pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p), - VM_PROT_READ|VM_PROT_WRITE, 1); + pmap_kenter( pg, VM_PAGE_TO_PHYS(p)); } + pmap_update(); } void vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) { vm_offset_t pg; vm_page_t p; vm_offset_t from = round_page(froma); vm_offset_t to = round_page(toa); for(pg = from ; pg < to ; pg += PAGE_SIZE) { - vm_offset_t pa; - pa = pmap_kextract(pg); - if( !pa) { - printf("No pa for va: %x\n", pg); - } else { - p = PHYS_TO_VM_PAGE( pa); - pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE); - vm_page_free(p); - } + p = PHYS_TO_VM_PAGE( pmap_kextract( pg)); + pmap_kremove( pg); + vm_page_free(p); } + pmap_update(); } void bufstats() { } Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 1886) +++ head/sys/sys/bio.h (revision 1887) @@ -1,214 +1,213 @@ /* * Copyright (c) 
1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id$ + * $Id: buf.h,v 1.4 1994/08/02 07:52:39 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ volatile long b_flags; /* B_* flags. */ int b_qindex; /* buffer queue index */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); struct vnode *b_vp; /* Device vnode. */ int b_pfcent; /* Center page when swapping cluster. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. 
*/ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ - TAILQ_HEAD(b_clusterhd,buf) b_cluster; /* low level clustering */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_PAGET 0x00010000 /* Page in/out of page table space. */ #define B_PGIN 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. 
*/ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_UAREA 0x00400000 /* Buffer describes Uarea I/O. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((int)(vnp) / sizeof(struct vnode))+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 5 /* number of free buffer queues */ LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_AGE 3 /* less useful buffers */ #define QUEUE_EMPTY 4 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ blkclr((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. 
*/ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL int nbuf; /* The number of buffer headers */ struct buf *buf; /* The buffer headers. */ char *buffers; /* The buffer contents. */ int bufpages; /* Number of memory pages in the buffer pool. */ struct buf *swbuf; /* Swap I/O buffer headers. */ int nswbuf; /* Number of swap I/O buffer headers. */ TAILQ_HEAD(swqueue, buf) bswlist; struct buf *bclnlist; /* Head of cleaned page list. */ __BEGIN_DECLS void allocbuf __P((struct buf *, int)); void bawrite __P((struct buf *)); void bdwrite __P((struct buf *)); void biodone __P((struct buf *)); int biowait __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); void brelse __P((struct buf *)); void bufinit __P((void)); int bwrite __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); void cluster_write __P((struct buf *, u_quad_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); struct buf *getnewbuf __P((int slpflag, int slptimeo)); struct buf *incore __P((struct vnode *, daddr_t)); u_int minphys __P((struct buf *bp)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 1886) +++ head/sys/sys/buf.h (revision 1887) @@ -1,214 +1,213 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id$ + * $Id: buf.h,v 1.4 1994/08/02 07:52:39 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. 
*/ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ volatile long b_flags; /* B_* flags. */ int b_qindex; /* buffer queue index */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); struct vnode *b_vp; /* Device vnode. */ int b_pfcent; /* Center page when swapping cluster. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ - TAILQ_HEAD(b_clusterhd,buf) b_cluster; /* low level clustering */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. 
*/ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_PAGET 0x00010000 /* Page in/out of page table space. */ #define B_PGIN 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_UAREA 0x00400000 /* Buffer describes Uarea I/O. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. 
*/ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((int)(vnp) / sizeof(struct vnode))+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 5 /* number of free buffer queues */ LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_AGE 3 /* less useful buffers */ #define QUEUE_EMPTY 4 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ blkclr((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL int nbuf; /* The number of buffer headers */ struct buf *buf; /* The buffer headers. */ char *buffers; /* The buffer contents. */ int bufpages; /* Number of memory pages in the buffer pool. */ struct buf *swbuf; /* Swap I/O buffer headers. */ int nswbuf; /* Number of swap I/O buffer headers. */ TAILQ_HEAD(swqueue, buf) bswlist; struct buf *bclnlist; /* Head of cleaned page list. 
*/ __BEGIN_DECLS void allocbuf __P((struct buf *, int)); void bawrite __P((struct buf *)); void bdwrite __P((struct buf *)); void biodone __P((struct buf *)); int biowait __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); void brelse __P((struct buf *)); void bufinit __P((void)); int bwrite __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); void cluster_write __P((struct buf *, u_quad_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); struct buf *getnewbuf __P((int slpflag, int slptimeo)); struct buf *incore __P((struct vnode *, daddr_t)); u_int minphys __P((struct buf *bp)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 1886) +++ head/sys/vm/swap_pager.c (revision 1887) @@ -1,1834 +1,1723 @@ /* * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id$ + * $Id: swap_pager.c,v 1.4 1994/08/02 07:55:13 davidg Exp $ */ /* * Quick hack to page to dedicated partition(s). 
* TODO: * Add multiprocessor locks * Deal with async writes in a better fashion */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NPENDINGIO #define NPENDINGIO 16 #endif extern int nswbuf; int nswiodone; extern int vm_pageout_rate_limit; static int cleandone; extern int hz; int swap_pager_full; extern vm_map_t pager_map; extern int vm_pageout_pages_needed; extern int vm_swap_size; extern struct vnode *swapdev_vp; #define MAX_PAGEOUT_CLUSTER 8 TAILQ_HEAD(swpclean, swpagerclean); typedef struct swpagerclean *swp_clean_t; struct swpagerclean { TAILQ_ENTRY(swpagerclean) spc_list; int spc_flags; struct buf *spc_bp; sw_pager_t spc_swp; vm_offset_t spc_kva; vm_offset_t spc_altkva; int spc_count; vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; } swcleanlist [NPENDINGIO] ; extern vm_map_t kernel_map; /* spc_flags values */ #define SPC_ERROR 0x01 #define SWB_EMPTY (-1) void swap_pager_init(void); vm_pager_t swap_pager_alloc(caddr_t, vm_size_t, vm_prot_t, vm_offset_t); void swap_pager_dealloc(vm_pager_t); boolean_t swap_pager_getpage(vm_pager_t, vm_page_t, boolean_t); boolean_t swap_pager_putpage(vm_pager_t, vm_page_t, boolean_t); boolean_t swap_pager_getmulti(vm_pager_t, vm_page_t *, int, int, boolean_t); boolean_t swap_pager_haspage(vm_pager_t, vm_offset_t); int swap_pager_io(sw_pager_t, vm_page_t *, int, int, int); void swap_pager_iodone(struct buf *); boolean_t swap_pager_clean(); extern struct pagerops swappagerops; struct swpclean swap_pager_done; /* list of compileted page cleans */ struct swpclean swap_pager_inuse; /* list of pending page cleans */ struct swpclean swap_pager_free; /* list of free pager clean structs */ struct pagerlst swap_pager_list; /* list of "named" anon regions */ struct pagerlst swap_pager_un_list; /* list of "unnamed" anon pagers */ #define SWAP_FREE_NEEDED 0x1 /* need a swap block */ int swap_pager_needflags; struct rlist *swapfrag; struct pagerlst *swp_qs[]={ 
&swap_pager_list, &swap_pager_un_list, (struct pagerlst *) 0 }; int swap_pager_putmulti(); struct pagerops swappagerops = { swap_pager_init, swap_pager_alloc, swap_pager_dealloc, swap_pager_getpage, swap_pager_getmulti, swap_pager_putpage, swap_pager_putmulti, swap_pager_haspage }; extern int nswbuf; int npendingio = NPENDINGIO; int pendingiowait; int require_swap_init; void swap_pager_finish(); int dmmin, dmmax; extern int vm_page_count; struct buf * getpbuf() ; void relpbuf(struct buf *bp) ; static inline void swapsizecheck() { if( vm_swap_size < 128*btodb(PAGE_SIZE)) { if( swap_pager_full) printf("swap_pager: out of space\n"); swap_pager_full = 1; } else if( vm_swap_size > 192*btodb(PAGE_SIZE)) swap_pager_full = 0; } void swap_pager_init() { extern int dmmin, dmmax; dfltpagerops = &swappagerops; TAILQ_INIT(&swap_pager_list); TAILQ_INIT(&swap_pager_un_list); /* * Initialize clean lists */ TAILQ_INIT(&swap_pager_inuse); TAILQ_INIT(&swap_pager_done); TAILQ_INIT(&swap_pager_free); require_swap_init = 1; /* * Calculate the swap allocation constants. */ dmmin = CLBYTES/DEV_BSIZE; dmmax = btodb(SWB_NPAGES*PAGE_SIZE)*2; } /* * Allocate a pager structure and associated resources. * Note that if we are called from the pageout daemon (handle == NULL) * we should not wait for memory as it could resulting in deadlock. 
*/ vm_pager_t swap_pager_alloc(handle, size, prot, offset) caddr_t handle; register vm_size_t size; vm_prot_t prot; vm_offset_t offset; { register vm_pager_t pager; register sw_pager_t swp; int waitok; int i,j; if (require_swap_init) { swp_clean_t spc; struct buf *bp; /* * kva's are allocated here so that we dont need to keep * doing kmem_alloc pageables at runtime */ for (i = 0, spc = swcleanlist; i < npendingio ; i++, spc++) { spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE); if (!spc->spc_kva) { break; } spc->spc_bp = malloc( sizeof( *bp), M_TEMP, M_NOWAIT); if (!spc->spc_bp) { kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); break; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); } require_swap_init = 0; if( size == 0) return(NULL); } /* * If this is a "named" anonymous region, look it up and * return the appropriate pager if it exists. */ if (handle) { pager = vm_pager_lookup(&swap_pager_list, handle); if (pager != NULL) { /* * Use vm_object_lookup to gain a reference * to the object and also to remove from the * object cache. */ if (vm_object_lookup(pager) == NULL) panic("swap_pager_alloc: bad object"); return(pager); } } if (swap_pager_full) { return(NULL); } /* * Pager doesn't exist, allocate swap management resources * and initialize. */ waitok = handle ? 
M_WAITOK : M_NOWAIT; pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok); if (pager == NULL) return(NULL); swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok); if (swp == NULL) { free((caddr_t)pager, M_VMPAGER); return(NULL); } size = round_page(size); swp->sw_osize = size; swp->sw_nblocks = (btodb(size) + btodb(SWB_NPAGES * PAGE_SIZE) - 1) / btodb(SWB_NPAGES*PAGE_SIZE); swp->sw_blocks = (sw_blk_t) malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks), M_VMPGDATA, waitok); if (swp->sw_blocks == NULL) { free((caddr_t)swp, M_VMPGDATA); free((caddr_t)pager, M_VMPAGER); return(NULL); } for (i = 0; i < swp->sw_nblocks; i++) { swp->sw_blocks[i].swb_valid = 0; swp->sw_blocks[i].swb_locked = 0; for (j = 0; j < SWB_NPAGES; j++) swp->sw_blocks[i].swb_block[j] = SWB_EMPTY; } swp->sw_poip = 0; if (handle) { vm_object_t object; swp->sw_flags = SW_NAMED; TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list); /* * Consistant with other pagers: return with object * referenced. Can't do this with handle == NULL * since it might be the pageout daemon calling. 
*/ object = vm_object_allocate(size); vm_object_enter(object, pager); vm_object_setpager(object, pager, 0, FALSE); } else { swp->sw_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_un_list, pager, pg_list); } pager->pg_handle = handle; pager->pg_ops = &swappagerops; pager->pg_type = PG_SWAP; pager->pg_data = (caddr_t)swp; return(pager); } /* * returns disk block associated with pager and offset * additionally, as a side effect returns a flag indicating * if the block has been written */ static int * swap_pager_diskaddr(swp, offset, valid) sw_pager_t swp; vm_offset_t offset; int *valid; { register sw_blk_t swb; int ix; if (valid) *valid = 0; ix = offset / (SWB_NPAGES*PAGE_SIZE); if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { return(FALSE); } swb = &swp->sw_blocks[ix]; ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; if (valid) *valid = swb->swb_valid & (1<swb_block[ix]; } /* * Utility routine to set the valid (written) bit for * a block associated with a pager and offset */ static void swap_pager_setvalid(swp, offset, valid) sw_pager_t swp; vm_offset_t offset; int valid; { register sw_blk_t swb; int ix; ix = offset / (SWB_NPAGES*PAGE_SIZE); if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) return; swb = &swp->sw_blocks[ix]; ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; if (valid) swb->swb_valid |= (1 << ix); else swb->swb_valid &= ~(1 << ix); return; } /* * this routine allocates swap space with a fragmentation * minimization policy. */ int swap_pager_getswapspace( unsigned amount, unsigned *rtval) { unsigned tmpalloc; unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); if( amount < nblocksfrag) { if( rlist_alloc(&swapfrag, amount, rtval)) return 1; if( !rlist_alloc(&swapmap, nblocksfrag, &tmpalloc)) return 0; rlist_free( &swapfrag, tmpalloc+amount, tmpalloc + nblocksfrag - 1); *rtval = tmpalloc; return 1; } if( !rlist_alloc(&swapmap, amount, rtval)) return 0; else return 1; } /* * this routine frees swap space with a fragmentation * minimization policy. 
*/ void swap_pager_freeswapspace( unsigned from, unsigned to) { unsigned nblocksfrag = btodb(SWB_NPAGES*PAGE_SIZE); unsigned tmpalloc; if( ((to + 1) - from) >= nblocksfrag) { while( (from + nblocksfrag) <= to + 1) { rlist_free(&swapmap, from, from + nblocksfrag - 1); from += nblocksfrag; } } if( from >= to) return; rlist_free(&swapfrag, from, to); while( rlist_alloc(&swapfrag, nblocksfrag, &tmpalloc)) { rlist_free(&swapmap, tmpalloc, tmpalloc + nblocksfrag-1); } } /* * this routine frees swap blocks from a specified pager */ void _swap_pager_freespace(swp, start, size) sw_pager_t swp; vm_offset_t start; vm_offset_t size; { vm_offset_t i; int s; s = splbio(); for (i = start; i < round_page(start + size - 1); i += PAGE_SIZE) { int valid; int *addr = swap_pager_diskaddr(swp, i, &valid); if (addr && *addr != SWB_EMPTY) { swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); if( valid) { vm_swap_size += btodb(PAGE_SIZE); swap_pager_setvalid(swp, i, 0); } *addr = SWB_EMPTY; } } swapsizecheck(); splx(s); } void swap_pager_freespace(pager, start, size) vm_pager_t pager; vm_offset_t start; vm_offset_t size; { _swap_pager_freespace((sw_pager_t) pager->pg_data, start, size); } /* * swap_pager_reclaim frees up over-allocated space from all pagers * this eliminates internal fragmentation due to allocation of space * for segments that are never swapped to. It has been written so that * it does not block until the rlist_free operation occurs; it keeps * the queues consistant. 
*/ /* * Maximum number of blocks (pages) to reclaim per pass */ #define MAXRECLAIM 256 void swap_pager_reclaim() { vm_pager_t p; sw_pager_t swp; int i, j, k; int s; int reclaimcount; static int reclaims[MAXRECLAIM]; static int in_reclaim; /* * allow only one process to be in the swap_pager_reclaim subroutine */ s = splbio(); if (in_reclaim) { tsleep((caddr_t) &in_reclaim, PSWP, "swrclm", 0); splx(s); return; } in_reclaim = 1; reclaimcount = 0; /* for each pager queue */ for (k = 0; swp_qs[k]; k++) { p = swp_qs[k]->tqh_first; while (p && (reclaimcount < MAXRECLAIM)) { /* * see if any blocks associated with a pager has been * allocated but not used (written) */ swp = (sw_pager_t) p->pg_data; for (i = 0; i < swp->sw_nblocks; i++) { sw_blk_t swb = &swp->sw_blocks[i]; if( swb->swb_locked) continue; for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY && (swb->swb_valid & (1 << j)) == 0) { reclaims[reclaimcount++] = swb->swb_block[j]; swb->swb_block[j] = SWB_EMPTY; if (reclaimcount >= MAXRECLAIM) goto rfinished; } } } p = p->pg_list.tqe_next; } } rfinished: /* * free the blocks that have been added to the reclaim list */ for (i = 0; i < reclaimcount; i++) { swap_pager_freeswapspace(reclaims[i], reclaims[i]+btodb(PAGE_SIZE) - 1); swapsizecheck(); wakeup((caddr_t) &in_reclaim); } splx(s); in_reclaim = 0; wakeup((caddr_t) &in_reclaim); } /* * swap_pager_copy copies blocks from one pager to another and * destroys the source pager */ void swap_pager_copy(srcpager, srcoffset, dstpager, dstoffset, offset) vm_pager_t srcpager; vm_offset_t srcoffset; vm_pager_t dstpager; vm_offset_t dstoffset; vm_offset_t offset; { sw_pager_t srcswp, dstswp; vm_offset_t i; int s; srcswp = (sw_pager_t) srcpager->pg_data; dstswp = (sw_pager_t) dstpager->pg_data; /* * remove the source pager from the swap_pager internal queue */ s = splbio(); if (srcswp->sw_flags & SW_NAMED) { TAILQ_REMOVE(&swap_pager_list, srcpager, pg_list); srcswp->sw_flags &= ~SW_NAMED; } else { 
TAILQ_REMOVE(&swap_pager_un_list, srcpager, pg_list); } while (srcswp->sw_poip) { tsleep((caddr_t)srcswp, PVM, "spgout", 0); } splx(s); /* * clean all of the pages that are currently active and finished */ (void) swap_pager_clean(); s = splbio(); /* * clear source block before destination object * (release allocated space) */ for (i = 0; i < offset + srcoffset; i += PAGE_SIZE) { int valid; int *addr = swap_pager_diskaddr(srcswp, i, &valid); if (addr && *addr != SWB_EMPTY) { swap_pager_freeswapspace(*addr, *addr+btodb(PAGE_SIZE) - 1); if( valid) vm_swap_size += btodb(PAGE_SIZE); swapsizecheck(); *addr = SWB_EMPTY; } } /* * transfer source to destination */ for (i = 0; i < dstswp->sw_osize; i += PAGE_SIZE) { int srcvalid, dstvalid; int *srcaddrp = swap_pager_diskaddr(srcswp, i + offset + srcoffset, &srcvalid); int *dstaddrp; /* * see if the source has space allocated */ if (srcaddrp && *srcaddrp != SWB_EMPTY) { /* * if the source is valid and the dest has no space, then * copy the allocation from the srouce to the dest. */ if (srcvalid) { dstaddrp = swap_pager_diskaddr(dstswp, i + dstoffset, &dstvalid); /* * if the dest already has a valid block, deallocate the * source block without copying. */ if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { swap_pager_freeswapspace(*dstaddrp, *dstaddrp+btodb(PAGE_SIZE) - 1); *dstaddrp = SWB_EMPTY; } if (dstaddrp && *dstaddrp == SWB_EMPTY) { *dstaddrp = *srcaddrp; *srcaddrp = SWB_EMPTY; swap_pager_setvalid(dstswp, i + dstoffset, 1); vm_swap_size -= btodb(PAGE_SIZE); } } /* * if the source is not empty at this point, then deallocate the space. 
*/ if (*srcaddrp != SWB_EMPTY) { swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); if( srcvalid) vm_swap_size += btodb(PAGE_SIZE); *srcaddrp = SWB_EMPTY; } } } /* * deallocate the rest of the source object */ for (i = dstswp->sw_osize + offset + srcoffset; i < srcswp->sw_osize; i += PAGE_SIZE) { int valid; int *srcaddrp = swap_pager_diskaddr(srcswp, i, &valid); if (srcaddrp && *srcaddrp != SWB_EMPTY) { swap_pager_freeswapspace(*srcaddrp, *srcaddrp+btodb(PAGE_SIZE) - 1); if( valid) vm_swap_size += btodb(PAGE_SIZE); *srcaddrp = SWB_EMPTY; } } swapsizecheck(); splx(s); free((caddr_t)srcswp->sw_blocks, M_VMPGDATA); srcswp->sw_blocks = 0; free((caddr_t)srcswp, M_VMPGDATA); srcpager->pg_data = 0; free((caddr_t)srcpager, M_VMPAGER); return; } void swap_pager_dealloc(pager) vm_pager_t pager; { register int i,j; register sw_blk_t bp; register sw_pager_t swp; int s; /* * Remove from list right away so lookups will fail if we * block for pageout completion. */ s = splbio(); swp = (sw_pager_t) pager->pg_data; if (swp->sw_flags & SW_NAMED) { TAILQ_REMOVE(&swap_pager_list, pager, pg_list); swp->sw_flags &= ~SW_NAMED; } else { TAILQ_REMOVE(&swap_pager_un_list, pager, pg_list); } /* * Wait for all pageouts to finish and remove * all entries from cleaning list. */ while (swp->sw_poip) { tsleep((caddr_t)swp, PVM, "swpout", 0); } splx(s); (void) swap_pager_clean(); /* * Free left over swap blocks */ s = splbio(); for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) { for (j = 0; j < SWB_NPAGES; j++) if (bp->swb_block[j] != SWB_EMPTY) { swap_pager_freeswapspace((unsigned)bp->swb_block[j], (unsigned)bp->swb_block[j] + btodb(PAGE_SIZE) - 1); if( bp->swb_valid & (1<swb_block[j] = SWB_EMPTY; } } splx(s); swapsizecheck(); /* * Free swap management resources */ free((caddr_t)swp->sw_blocks, M_VMPGDATA); swp->sw_blocks = 0; free((caddr_t)swp, M_VMPGDATA); pager->pg_data = 0; free((caddr_t)pager, M_VMPAGER); } /* * swap_pager_getmulti can get multiple pages. 
*/ int swap_pager_getmulti(pager, m, count, reqpage, sync) vm_pager_t pager; vm_page_t *m; int count; int reqpage; boolean_t sync; { if( reqpage >= count) panic("swap_pager_getmulti: reqpage >= count\n"); return swap_pager_input((sw_pager_t) pager->pg_data, m, count, reqpage); } /* * swap_pager_getpage gets individual pages */ int swap_pager_getpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { vm_page_t marray[1]; marray[0] = m; return swap_pager_input((sw_pager_t)pager->pg_data, marray, 1, 0); } int swap_pager_putmulti(pager, m, c, sync, rtvals) vm_pager_t pager; vm_page_t *m; int c; boolean_t sync; int *rtvals; { int flags; if (pager == NULL) { (void) swap_pager_clean(); return VM_PAGER_OK; } flags = B_WRITE; if (!sync) flags |= B_ASYNC; return swap_pager_output((sw_pager_t)pager->pg_data, m, c, flags, rtvals); } /* * swap_pager_putpage writes individual pages */ int swap_pager_putpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { int flags; vm_page_t marray[1]; int rtvals[1]; if (pager == NULL) { (void) swap_pager_clean(); return VM_PAGER_OK; } marray[0] = m; flags = B_WRITE; if (!sync) flags |= B_ASYNC; swap_pager_output((sw_pager_t)pager->pg_data, marray, 1, flags, rtvals); return rtvals[0]; } static inline int const swap_pager_block_index(swp, offset) sw_pager_t swp; vm_offset_t offset; { return (offset / (SWB_NPAGES*PAGE_SIZE)); } static inline int const swap_pager_block_offset(swp, offset) sw_pager_t swp; vm_offset_t offset; { return ((offset % (PAGE_SIZE*SWB_NPAGES)) / PAGE_SIZE); } /* * _swap_pager_haspage returns TRUE if the pager has data that has * been written out. 
*/ static boolean_t _swap_pager_haspage(swp, offset) sw_pager_t swp; vm_offset_t offset; { register sw_blk_t swb; int ix; ix = offset / (SWB_NPAGES*PAGE_SIZE); if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { return(FALSE); } swb = &swp->sw_blocks[ix]; ix = (offset % (SWB_NPAGES*PAGE_SIZE)) / PAGE_SIZE; if (swb->swb_block[ix] != SWB_EMPTY) { if (swb->swb_valid & (1 << ix)) return TRUE; } return(FALSE); } /* * swap_pager_haspage is the externally accessible version of * _swap_pager_haspage above. this routine takes a vm_pager_t * for an argument instead of sw_pager_t. */ boolean_t swap_pager_haspage(pager, offset) vm_pager_t pager; vm_offset_t offset; { return _swap_pager_haspage((sw_pager_t) pager->pg_data, offset); } /* * swap_pager_freepage is a convienience routine that clears the busy * bit and deallocates a page. */ static void swap_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * swap_pager_ridpages is a convienience routine that deallocates all * but the required page. this is usually used in error returns that * need to invalidate the "extra" readahead pages. 
*/ static void swap_pager_ridpages(m, count, reqpage) vm_page_t *m; int count; int reqpage; { int i; for (i = 0; i < count; i++) if (i != reqpage) swap_pager_freepage(m[i]); } int swapwritecount=0; /* * swap_pager_iodone1 is the completion routine for both reads and async writes */ void swap_pager_iodone1(bp) struct buf *bp; { bp->b_flags |= B_DONE; bp->b_flags &= ~B_ASYNC; wakeup((caddr_t)bp); /* if ((bp->b_flags & B_READ) == 0) vwakeup(bp); */ } int swap_pager_input(swp, m, count, reqpage) register sw_pager_t swp; vm_page_t *m; int count, reqpage; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i; boolean_t rv; vm_offset_t kva, off[count]; swp_clean_t spc; vm_offset_t paging_offset; vm_object_t object; int reqaddr[count]; int first, last; int failed; int reqdskregion; object = m[reqpage]->object; paging_offset = object->paging_offset; /* * First determine if the page exists in the pager if this is * a sync read. This quickly handles cases where we are * following shadow chains looking for the top level object * with the page. 
*/ if (swp->sw_blocks == NULL) { swap_pager_ridpages(m, count, reqpage); return(VM_PAGER_FAIL); } for(i = 0; i < count; i++) { vm_offset_t foff = m[i]->offset + paging_offset; int ix = swap_pager_block_index(swp, foff); if (ix >= swp->sw_nblocks) { int j; if( i <= reqpage) { swap_pager_ridpages(m, count, reqpage); return(VM_PAGER_FAIL); } for(j = i; j < count; j++) { swap_pager_freepage(m[j]); } count = i; break; } swb[i] = &swp->sw_blocks[ix]; off[i] = swap_pager_block_offset(swp, foff); reqaddr[i] = swb[i]->swb_block[off[i]]; } /* make sure that our required input request is existant */ if (reqaddr[reqpage] == SWB_EMPTY || (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { swap_pager_ridpages(m, count, reqpage); return(VM_PAGER_FAIL); } reqdskregion = reqaddr[reqpage] / dmmax; /* * search backwards for the first contiguous page to transfer */ failed = 0; first = 0; for (i = reqpage - 1; i >= 0; --i) { if ( failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (first == 0) first = i + 1; } } /* * search forwards for the last contiguous page to transfer */ failed = 0; last = count; for (i = reqpage + 1; i < count; i++) { if ( failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (last == count) last = i; } } count = last; if (first != 0) { for (i = first; i < count; i++) { m[i-first] = m[i]; reqaddr[i-first] = reqaddr[i]; off[i-first] = off[i]; } count -= first; reqpage -= first; } ++swb[reqpage]->swb_locked; /* * at this point: * "m" is a pointer to the array of vm_page_t for paging I/O * "count" is the number of vm_page_t entries represented by "m" * "object" is the vm_object_t for I/O * 
"reqpage" is the index into "m" for the page actually faulted */ spc = NULL; /* we might not use an spc data structure */ - kva = 0; - /* - * we allocate a new kva for transfers > 1 page - * but for transfers == 1 page, the swap_pager_free list contains - * entries that have pre-allocated kva's (for efficiency). - */ - if (count > 1) { - kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); - } - - - if (!kva) { + if (count == 1) { /* * if a kva has not been allocated, we can only do a one page transfer, * so we free the other pages that might have been allocated by * vm_fault. */ swap_pager_ridpages(m, count, reqpage); m[0] = m[reqpage]; reqaddr[0] = reqaddr[reqpage]; count = 1; reqpage = 0; /* * get a swap pager clean data structure, block until we get it */ if (swap_pager_free.tqh_first == NULL) { s = splbio(); if( curproc == pageproc) (void) swap_pager_clean(); else wakeup((caddr_t) &vm_pages_needed); while (swap_pager_free.tqh_first == NULL) { swap_pager_needflags |= SWAP_FREE_NEEDED; tsleep((caddr_t)&swap_pager_free, PVM, "swpfre", 0); if( curproc == pageproc) (void) swap_pager_clean(); else wakeup((caddr_t) &vm_pages_needed); } splx(s); } spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); kva = spc->spc_kva; - } - - - /* - * map our page(s) into kva for input - */ - for (i = 0; i < count; i++) { - pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); - } - pmap_update(); - - - /* - * Get a swap buffer header and perform the IO - */ - if( spc) { bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; } else { + /* + * Get a swap buffer header to perform the IO + */ bp = getpbuf(); + kva = (vm_offset_t) bp->b_data; } + /* + * map our page(s) into kva for input + */ + pmap_qenter( kva, m, count); + s = splbio(); bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = swap_pager_iodone1; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; crhold(bp->b_rcred); 
crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr[0]; bp->b_bcount = PAGE_SIZE*count; bp->b_bufsize = PAGE_SIZE*count; -/* - VHOLD(swapdev_vp); - bp->b_vp = swapdev_vp; - if (swapdev_vp->v_type == VBLK) - bp->b_dev = swapdev_vp->v_rdev; -*/ bgetvp( swapdev_vp, bp); swp->sw_piip++; /* * perform the I/O */ VOP_STRATEGY(bp); /* * wait for the sync I/O to complete */ while ((bp->b_flags & B_DONE) == 0) { tsleep((caddr_t)bp, PVM, "swread", 0); } rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); --swp->sw_piip; if (swp->sw_piip == 0) wakeup((caddr_t) swp); /* * relpbuf does this, but we maintain our own buffer * list also... */ if (bp->b_vp) brelvp(bp); splx(s); --swb[reqpage]->swb_locked; /* * remove the mapping for kernel virtual */ - pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + pmap_qremove( kva, count); if (spc) { /* * if we have used an spc, we need to free it. */ if( bp->b_rcred != NOCRED) crfree(bp->b_rcred); if( bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup((caddr_t)&swap_pager_free); } } else { /* - * free the kernel virtual addresses - */ - kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); - /* * release the physical I/O buffer */ relpbuf(bp); /* * finish up input if everything is ok */ if( rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->flags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; if (i != reqpage) { /* * whether or not to leave the page activated * is up in the air, but we should put the page * on a page queue somewhere. (it already is in * the object). * After some emperical results, it is best * to deactivate the readahead pages. 
*/ vm_page_deactivate(m[i]); /* * just in case someone was asking for this * page we now tell them that it is ok to use */ m[i]->flags &= ~PG_FAKE; PAGE_WAKEUP(m[i]); } } if( swap_pager_full) { _swap_pager_freespace( swp, m[0]->offset+paging_offset, count*PAGE_SIZE); } } else { swap_pager_ridpages(m, count, reqpage); } } return(rv); } int swap_pager_output(swp, m, count, flags, rtvals) register sw_pager_t swp; vm_page_t *m; int count; int flags; int *rtvals; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i, j, ix; boolean_t rv; vm_offset_t kva, off, foff; swp_clean_t spc; vm_offset_t paging_offset; vm_object_t object; int reqaddr[count]; int failed; /* if( count > 1) printf("off: 0x%x, count: %d\n", m[0]->offset, count); */ spc = NULL; object = m[0]->object; paging_offset = object->paging_offset; failed = 0; for(j=0;joffset + paging_offset; ix = swap_pager_block_index(swp, foff); swb[j] = 0; if( swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { rtvals[j] = VM_PAGER_FAIL; failed = 1; continue; } else { rtvals[j] = VM_PAGER_OK; } swb[j] = &swp->sw_blocks[ix]; ++swb[j]->swb_locked; if( failed) { rtvals[j] = VM_PAGER_FAIL; continue; } off = swap_pager_block_offset(swp, foff); reqaddr[j] = swb[j]->swb_block[off]; if( reqaddr[j] == SWB_EMPTY) { int blk; int tries; int ntoget; tries = 0; s = splbio(); /* * if any other pages have been allocated in this block, we * only try to get one page. */ for (i = 0; i < SWB_NPAGES; i++) { if (swb[j]->swb_block[i] != SWB_EMPTY) break; } ntoget = (i == SWB_NPAGES) ? 
SWB_NPAGES : 1; /* * this code is alittle conservative, but works * (the intent of this code is to allocate small chunks * for small objects) */ if( (m[j]->offset == 0) && (ntoget*PAGE_SIZE > object->size)) { ntoget = (object->size + (PAGE_SIZE-1))/PAGE_SIZE; } retrygetspace: if (!swap_pager_full && ntoget > 1 && swap_pager_getswapspace(ntoget * btodb(PAGE_SIZE), &blk)) { for (i = 0; i < ntoget; i++) { swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; swb[j]->swb_valid = 0; } reqaddr[j] = swb[j]->swb_block[off]; } else if (!swap_pager_getswapspace(btodb(PAGE_SIZE), &swb[j]->swb_block[off])) { /* * if the allocation has failed, we try to reclaim space and * retry. */ if (++tries == 1) { swap_pager_reclaim(); goto retrygetspace; } rtvals[j] = VM_PAGER_AGAIN; failed = 1; } else { reqaddr[j] = swb[j]->swb_block[off]; swb[j]->swb_valid &= ~(1<swb_locked; } } for(i = 0; i < count; i++) if( rtvals[i] != VM_PAGER_OK) break; if( i == 0) { return VM_PAGER_AGAIN; } count = i; for(i=0;i 1 page * but for transfers == 1 page, the swap_pager_free list contains * entries that have pre-allocated kva's (for efficiency). + * NOTE -- we do not use the physical buffer pool or the + * preallocated associated kva's because of the potential for + * deadlock. This is very subtile -- but deadlocks or resource + * contention must be avoided on pageouts -- or your system will + * sleep (forever) !!! 
*/ if ( count > 1) { kva = kmem_alloc_pageable(pager_map, count*PAGE_SIZE); if( !kva) { for (i = 0; i < count; i++) { if( swb[i]) --swb[i]->swb_locked; rtvals[i] = VM_PAGER_AGAIN; } return VM_PAGER_AGAIN; } } /* * get a swap pager clean data structure, block until we get it */ if (swap_pager_free.tqh_first == NULL) { /* if (flags & B_ASYNC) { for(i=0;iswb_locked; } return VM_PAGER_AGAIN; } */ s = splbio(); if( curproc == pageproc) (void) swap_pager_clean(); else wakeup((caddr_t) &vm_pages_needed); while (swap_pager_free.tqh_first == NULL) { swap_pager_needflags |= SWAP_FREE_NEEDED; tsleep((caddr_t)&swap_pager_free, PVM, "swpfre", 0); if( curproc == pageproc) (void) swap_pager_clean(); else wakeup((caddr_t) &vm_pages_needed); } splx(s); } spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); if( !kva) { kva = spc->spc_kva; spc->spc_altkva = 0; } else { spc->spc_altkva = kva; } /* * map our page(s) into kva for I/O */ - for (i = 0; i < count; i++) { - pmap_kenter( kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); - } - pmap_update(); + pmap_qenter(kva, m, count); /* * get the base I/O offset into the swap file */ for(i=0;ioffset + paging_offset; off = swap_pager_block_offset(swp, foff); /* * if we are setting the valid bit anew, * then diminish the swap free space */ if( (swb[i]->swb_valid & (1 << off)) == 0) vm_swap_size -= btodb(PAGE_SIZE); /* * set the valid bit */ swb[i]->swb_valid |= (1 << off); /* * and unlock the data structure */ --swb[i]->swb_locked; } s = splbio(); /* * Get a swap buffer header and perform the IO */ bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_flags = B_BUSY; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr[0]; bgetvp( swapdev_vp, bp); -/* - VHOLD(swapdev_vp); - bp->b_vp = swapdev_vp; - if (swapdev_vp->v_type == VBLK) - bp->b_dev = 
swapdev_vp->v_rdev; -*/ + bp->b_bcount = PAGE_SIZE*count; bp->b_bufsize = PAGE_SIZE*count; swapdev_vp->v_numoutput++; /* * If this is an async write we set up additional buffer fields * and place a "cleaning" entry on the inuse queue. */ if ( flags & B_ASYNC ) { spc->spc_flags = 0; spc->spc_swp = swp; for(i=0;ispc_m[i] = m[i]; spc->spc_count = count; /* * the completion routine for async writes */ bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; swp->sw_poip++; TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); } else { swp->sw_poip++; bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone1; } /* * perform the I/O */ VOP_STRATEGY(bp); if ((flags & (B_READ|B_ASYNC)) == B_ASYNC ) { if ((bp->b_flags & B_DONE) == B_DONE) { swap_pager_clean(); } splx(s); for(i=0;ib_flags & B_DONE) == 0) { tsleep((caddr_t)bp, PVM, "swwrt", 0); } rv = (bp->b_flags & B_ERROR) ? VM_PAGER_FAIL : VM_PAGER_OK; bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_CALL|B_DONE); --swp->sw_poip; if (swp->sw_poip == 0) wakeup((caddr_t) swp); if (bp->b_vp) brelvp(bp); splx(s); /* * remove the mapping for kernel virtual */ - pmap_remove(vm_map_pmap(pager_map), kva, kva + count * PAGE_SIZE); + pmap_qremove( kva, count); /* * if we have written the page, then indicate that the page * is clean. */ if (rv == VM_PAGER_OK) { for(i=0;iflags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); /* * optimization, if a page has been read during the * pageout process, we activate it. 
*/ if ( (m[i]->flags & PG_ACTIVE) == 0 && pmap_is_referenced(VM_PAGE_TO_PHYS(m[i]))) vm_page_activate(m[i]); } } } else { for(i=0;iflags |= PG_LAUNDRY; } } if( spc->spc_altkva) kmem_free_wakeup(pager_map, kva, count * PAGE_SIZE); if( bp->b_rcred != NOCRED) crfree(bp->b_rcred); if( bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup((caddr_t)&swap_pager_free); } return(rv); } boolean_t swap_pager_clean() { register swp_clean_t spc, tspc; register int s; tspc = NULL; if (swap_pager_done.tqh_first == NULL) return FALSE; for (;;) { s = splbio(); /* * Look up and removal from done list must be done * at splbio() to avoid conflicts with swap_pager_iodone. */ while (spc = swap_pager_done.tqh_first) { if( spc->spc_altkva) { - pmap_remove(vm_map_pmap(pager_map), spc->spc_altkva, spc->spc_altkva + spc->spc_count * PAGE_SIZE); + pmap_qremove( spc->spc_altkva, spc->spc_count); kmem_free_wakeup(pager_map, spc->spc_altkva, spc->spc_count * PAGE_SIZE); spc->spc_altkva = 0; } else { - pmap_remove(vm_map_pmap(pager_map), spc->spc_kva, spc->spc_kva + PAGE_SIZE); + pmap_qremove( spc->spc_kva, 1); } swap_pager_finish(spc); TAILQ_REMOVE(&swap_pager_done, spc, spc_list); goto doclean; } /* * No operations done, thats all we can do for now. */ splx(s); break; /* * The desired page was found to be busy earlier in * the scan but has since completed. */ doclean: if (tspc && tspc == spc) { tspc = NULL; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup((caddr_t)&swap_pager_free); } ++cleandone; splx(s); } return(tspc ? 
TRUE : FALSE); } void swap_pager_finish(spc) register swp_clean_t spc; { vm_object_t object = spc->spc_m[0]->object; int i; if ((object->paging_in_progress -= spc->spc_count) == 0) thread_wakeup((int) object); /* * If no error mark as clean and inform the pmap system. * If error, mark as dirty so we will try again. * (XXX could get stuck doing this, should give up after awhile) */ if (spc->spc_flags & SPC_ERROR) { for(i=0;ispc_count;i++) { printf("swap_pager_finish: clean of page %x failed\n", VM_PAGE_TO_PHYS(spc->spc_m[i])); spc->spc_m[i]->flags |= PG_LAUNDRY; } } else { for(i=0;ispc_count;i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); spc->spc_m[i]->flags |= PG_CLEAN; } } for(i=0;ispc_count;i++) { /* * we wakeup any processes that are waiting on * these pages. */ PAGE_WAKEUP(spc->spc_m[i]); } nswiodone -= spc->spc_count; return; } /* * swap_pager_iodone */ void swap_pager_iodone(bp) register struct buf *bp; { register swp_clean_t spc; int s; s = splbio(); spc = (swp_clean_t) bp->b_spc; TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); if (bp->b_flags & B_ERROR) { spc->spc_flags |= SPC_ERROR; printf("error %d blkno %d sz %d ", bp->b_error, bp->b_blkno, bp->b_bcount); } /* if ((bp->b_flags & B_READ) == 0) vwakeup(bp); */ bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_DIRTY|B_ASYNC); if (bp->b_vp) { brelvp(bp); } if( bp->b_rcred != NOCRED) crfree(bp->b_rcred); if( bp->b_wcred != NOCRED) crfree(bp->b_wcred); nswiodone += spc->spc_count; if (--spc->spc_swp->sw_poip == 0) { wakeup((caddr_t)spc->spc_swp); } if ((swap_pager_needflags & SWAP_FREE_NEEDED) || swap_pager_inuse.tqh_first == 0) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup((caddr_t)&swap_pager_free); wakeup((caddr_t)&vm_pages_needed); } if (vm_pageout_pages_needed) { wakeup((caddr_t)&vm_pageout_pages_needed); } if ((swap_pager_inuse.tqh_first == NULL) || (cnt.v_free_count < cnt.v_free_min && nswiodone + cnt.v_free_count >= cnt.v_free_min) ) { 
wakeup((caddr_t)&vm_pages_needed); - } - splx(s); -} - -int bswneeded; -/* TAILQ_HEAD(swqueue, buf) bswlist; */ -/* - * allocate a physical buffer - */ -struct buf * -getpbuf() { - int s; - struct buf *bp; - - s = splbio(); - /* get a bp from the swap buffer header pool */ - while ((bp = bswlist.tqh_first) == NULL) { - bswneeded = 1; - tsleep((caddr_t)&bswneeded, PVM, "wswbuf", 0); - } - TAILQ_REMOVE(&bswlist, bp, b_freelist); - - splx(s); - - bzero(bp, sizeof *bp); - bp->b_rcred = NOCRED; - bp->b_wcred = NOCRED; - return bp; -} - -/* - * allocate a physical buffer, if one is available - */ -struct buf * -trypbuf() { - int s; - struct buf *bp; - - s = splbio(); - if ((bp = bswlist.tqh_first) == NULL) { - splx(s); - return NULL; - } - TAILQ_REMOVE(&bswlist, bp, b_freelist); - splx(s); - - bzero(bp, sizeof *bp); - bp->b_rcred = NOCRED; - bp->b_wcred = NOCRED; - return bp; -} - -/* - * release a physical buffer - */ -void -relpbuf(bp) - struct buf *bp; -{ - int s; - - s = splbio(); - - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } - - if (bp->b_vp) - brelvp(bp); - - TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); - - if (bswneeded) { - bswneeded = 0; - wakeup((caddr_t)&bswlist); } splx(s); } /* * return true if any swap control structures can be allocated */ int swap_pager_ready() { if( swap_pager_free.tqh_first) return 1; else return 0; } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 1886) +++ head/sys/vm/vm_fault.c (revision 1887) @@ -1,1307 +1,1310 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. 
* * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id$ + * $Id: vm_fault.c,v 1.3 1994/08/02 07:55:18 davidg Exp $ */ /* * Page fault handling module. */ #include #include #include #include #include #include #include #define VM_FAULT_READ_AHEAD 4 #define VM_FAULT_READ_AHEAD_MIN 1 #define VM_FAULT_READ_BEHIND 3 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) extern int swap_pager_full; extern int vm_pageout_proc_limit; /* * vm_fault: * * Handle a page fault occuring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * * The map in question must be referenced, and remains so. * Caller may hold no locks. 
*/ int vm_fault(map, vaddr, fault_type, change_wiring) vm_map_t map; vm_offset_t vaddr; vm_prot_t fault_type; boolean_t change_wiring; { vm_object_t first_object; vm_offset_t first_offset; vm_map_entry_t entry; register vm_object_t object; register vm_offset_t offset; vm_page_t m; vm_page_t first_m; vm_prot_t prot; int result; boolean_t wired; boolean_t su; boolean_t lookup_still_valid; boolean_t page_exists; vm_page_t old_m; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int reqpage; int spl; int hardfault=0; cnt.v_faults++; /* needs lock XXX */ /* * Recovery actions */ #define FREE_PAGE(m) { \ PAGE_WAKEUP(m); \ vm_page_lock_queues(); \ vm_page_free(m); \ vm_page_unlock_queues(); \ } #define RELEASE_PAGE(m) { \ PAGE_WAKEUP(m); \ vm_page_lock_queues(); \ vm_page_activate(m); \ vm_page_unlock_queues(); \ } #define UNLOCK_MAP { \ if (lookup_still_valid) { \ vm_map_lookup_done(map, entry); \ lookup_still_valid = FALSE; \ } \ } #define UNLOCK_THINGS { \ object->paging_in_progress--; \ if (object->paging_in_progress == 0) \ wakeup((caddr_t)object); \ vm_object_unlock(object); \ if (object != first_object) { \ vm_object_lock(first_object); \ FREE_PAGE(first_m); \ first_object->paging_in_progress--; \ if (first_object->paging_in_progress == 0) \ wakeup((caddr_t)first_object); \ vm_object_unlock(first_object); \ } \ UNLOCK_MAP; \ } #define UNLOCK_AND_DEALLOCATE { \ UNLOCK_THINGS; \ vm_object_deallocate(first_object); \ } RetryFault: ; /* * Find the backing store object and offset into * it to begin the search. */ if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, &first_object, &first_offset, &prot, &wired, &su)) != KERN_SUCCESS) { return(result); } lookup_still_valid = TRUE; if (wired) fault_type = prot; first_m = NULL; /* * Make a reference to this object to * prevent its disposal while we are messing with * it. Once we have the reference, the map is free * to be diddled. 
Since objects reference their * shadows (and copies), they will stay around as well. */ vm_object_lock(first_object); first_object->ref_count++; first_object->paging_in_progress++; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object * lock or a busy page in some object to prevent * some other thread from trying to bring in * the same page. * * Note that we cannot hold any locks during the * pager access or when waiting for memory, so * we use a busy page then. * * Note also that we aren't as concerned about * more than one thead attempting to pager_data_unlock * the same page at once, so we don't hold the page * as busy then, but do record the highest unlock * value so far. [Unlock requests may also be delivered * out of order.] * * 2) Once we have a busy page, we must remove it from * the pageout queues, so that the pageout daemon * will not grab it away. * * 3) To prevent another thread from racing us down the * shadow chain and entering a new page in the top * object before we do, we must keep a busy page in * the top object while following the shadow chain. * * 4) We must increment paging_in_progress on any object * for which we have a busy page, to prevent * vm_object_collapse from removing the busy page * without our noticing. */ /* * Search for the page at object/offset. */ object = first_object; offset = first_offset; /* * See whether this page is resident */ while (TRUE) { m = vm_page_lookup(object, offset); if (m != NULL) { /* * If the page is being brought in, * wait for it and then retry. */ if (m->flags & PG_BUSY) { + int s; UNLOCK_THINGS; + s = splhigh(); if (m->flags & PG_BUSY) { m->flags |= PG_WANTED; tsleep((caddr_t)m,PSWP,"vmpfw",0); } + splx(s); vm_object_deallocate(first_object); goto RetryFault; } /* * Remove the page from the pageout daemon's * reach while we play with it. 
*/ vm_page_lock_queues(); - spl = splimp(); + spl = splhigh(); if (m->flags & PG_INACTIVE) { TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); m->flags &= ~PG_INACTIVE; cnt.v_inactive_count--; cnt.v_reactivated++; } if (m->flags & PG_ACTIVE) { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); m->flags &= ~PG_ACTIVE; cnt.v_active_count--; } splx(spl); vm_page_unlock_queues(); /* * Mark page busy for other threads. */ m->flags |= PG_BUSY; break; } if (((object->pager != NULL) && (!change_wiring || wired)) || (object == first_object)) { #if 0 if (curproc && (vaddr < VM_MAXUSER_ADDRESS) && (curproc->p_rlimit[RLIMIT_RSS].rlim_max < curproc->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG)) { UNLOCK_AND_DEALLOCATE; vm_fault_free_pages(curproc); goto RetryFault; } #endif if (swap_pager_full && !object->shadow && (!object->pager || (object->pager && object->pager->pg_type == PG_SWAP && !vm_pager_has_page(object->pager, offset+object->paging_offset)))) { if (vaddr < VM_MAXUSER_ADDRESS && curproc && curproc->p_pid >= 48) /* XXX */ { printf("Process %d killed by vm_fault -- out of swap\n", curproc->p_pid); psignal(curproc, SIGKILL); curproc->p_estcpu = 0; curproc->p_nice = PRIO_MIN; setpriority(curproc); } } /* * Allocate a new page for this object/offset * pair. */ m = vm_page_alloc(object, offset); if (m == NULL) { UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } } if (object->pager != NULL && (!change_wiring || wired)) { int rv; int faultcount; int reqpage; /* * Now that we have a busy page, we can * release the object lock. */ vm_object_unlock(object); /* * now we find out if any other pages should * be paged in at this time * this routine checks to see if the pages surrounding this fault * reside in the same object as the page for this fault. If * they do, then they are faulted in also into the * object. The array "marray" returned contains an array of * vm_page_t structs where one of them is the vm_page_t passed to * the routine. 
The reqpage return value is the index into the * marray for the vm_page_t passed to the routine. */ cnt.v_pageins++; faultcount = vm_fault_additional_pages(first_object, first_offset, m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage); /* * Call the pager to retrieve the data, if any, * after releasing the lock on the map. */ UNLOCK_MAP; rv = faultcount ? vm_pager_get_pages(object->pager, marray, faultcount, reqpage, TRUE): VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. * Leave it busy while we play with it. */ vm_object_lock(object); /* * Relookup in case pager changed page. * Pager is responsible for disposition * of old page if moved. */ m = vm_page_lookup(object, offset); cnt.v_pgpgin++; m->flags &= ~PG_FAKE; pmap_clear_modify(VM_PAGE_TO_PHYS(m)); hardfault++; break; } /* * Remove the bogus page (which does not * exist at this object/offset); before * doing so, we must get back our object * lock to preserve our invariant. * * Also wake up any other thread that may want * to bring in this page. * * If this is the top-level object, we must * leave the busy page to prevent another * thread from rushing past us, and inserting * the page in that object at the same time * that we are. */ vm_object_lock(object); /* * Data outside the range of the pager; an error */ if ((rv == VM_PAGER_ERROR) || (rv == VM_PAGER_BAD)) { FREE_PAGE(m); UNLOCK_AND_DEALLOCATE; return(KERN_PROTECTION_FAILURE); /* XXX */ } if (object != first_object) { FREE_PAGE(m); /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has no pager (or unwiring) * or the pager doesn't have the page. */ if (object == first_object) first_m = m; /* * Move on to the next object. Lock the next * object before unlocking the current one. */ offset += object->shadow_offset; next_object = object->shadow; if (next_object == NULL) { /* * If there's no object left, fill the page * in the top object with zeros. 
*/ if (object != first_object) { object->paging_in_progress--; if (object->paging_in_progress == 0) wakeup((caddr_t) object); vm_object_unlock(object); object = first_object; offset = first_offset; m = first_m; vm_object_lock(object); } first_m = NULL; vm_page_zero_fill(m); cnt.v_zfod++; m->flags &= ~PG_FAKE; break; } else { vm_object_lock(next_object); if (object != first_object) { object->paging_in_progress--; if (object->paging_in_progress == 0) wakeup((caddr_t) object); } vm_object_unlock(object); object = next_object; object->paging_in_progress++; } } if ((m->flags & (PG_ACTIVE|PG_INACTIVE) != 0) || (m->flags & PG_BUSY) == 0) panic("vm_fault: absent or active or inactive or not busy after main loop"); /* * PAGE HAS BEEN FOUND. * [Loop invariant still holds -- the object lock * is held.] */ old_m = m; /* save page that would be copied */ /* * If the page is being written, but isn't * already owned by the top-level object, * we have to copy it into a new page owned * by the top-level object. */ if (object != first_object) { /* * We only really need to copy if we * want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * If we try to collapse first_object at this * point, we may deadlock when we try to get * the lock on an intermediate object (since we * have the bottom object locked). We can't * unlock the bottom object, because the page * we found may move (by collapse) if we do. * * Instead, we first copy the page. Then, when * we have no more use for the bottom object, * we unlock it and try to collapse. * * Note that we copy the page even if we didn't * need to... that's the breaks. */ /* * We already have an empty page in * first_object - use it. */ vm_page_copy(m, first_m); first_m->flags &= ~PG_FAKE; /* * If another map is truly sharing this * page with us, we have to flush all * uses of the original page, since we * can't distinguish those which want the * original from those which need the * new copy. 
* * XXX If we know that only one map has * access to this page, then we could * avoid the pmap_page_protect() call. */ vm_page_lock_queues(); vm_page_activate(m); pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); if ((m->flags & PG_CLEAN) == 0) m->flags |= PG_LAUNDRY; vm_page_unlock_queues(); /* * We no longer need the old page or object. */ PAGE_WAKEUP(m); object->paging_in_progress--; if (object->paging_in_progress == 0) wakeup((caddr_t) object); vm_object_unlock(object); /* * Only use the new page below... */ cnt.v_cow_faults++; m = first_m; object = first_object; offset = first_offset; /* * Now that we've gotten the copy out of the * way, let's try to collapse the top object. */ vm_object_lock(object); /* * But we have to play ugly games with * paging_in_progress to do that... */ object->paging_in_progress--; if (object->paging_in_progress == 0) wakeup((caddr_t) object); vm_object_collapse(object); object->paging_in_progress++; } else { prot &= ~VM_PROT_WRITE; m->flags |= PG_COPYONWRITE; } } if (m->flags & (PG_ACTIVE|PG_INACTIVE)) panic("vm_fault: active or inactive before copy object handling"); /* * If the page is being written, but hasn't been * copied to the copy-object, we have to copy it there. */ RetryCopy: if (first_object->copy != NULL) { vm_object_t copy_object = first_object->copy; vm_offset_t copy_offset; vm_page_t copy_m; /* * We only need to copy if we want to write it. */ if ((fault_type & VM_PROT_WRITE) == 0) { prot &= ~VM_PROT_WRITE; m->flags |= PG_COPYONWRITE; } else { /* * Try to get the lock on the copy_object. */ if (!vm_object_lock_try(copy_object)) { vm_object_unlock(object); /* should spin a bit here... */ vm_object_lock(object); goto RetryCopy; } /* * Make another reference to the copy-object, * to keep it from disappearing during the * copy. */ copy_object->ref_count++; /* * Does the page exist in the copy? 
*/ copy_offset = first_offset - copy_object->shadow_offset; copy_m = vm_page_lookup(copy_object, copy_offset); if (page_exists = (copy_m != NULL)) { if (copy_m->flags & PG_BUSY) { /* * If the page is being brought * in, wait for it and then retry. */ PAGE_ASSERT_WAIT(copy_m, !change_wiring); RELEASE_PAGE(m); copy_object->ref_count--; vm_object_unlock(copy_object); UNLOCK_THINGS; thread_block("fltcpy"); vm_object_deallocate(first_object); goto RetryFault; } } /* * If the page is not in memory (in the object) * and the object has a pager, we have to check * if the pager has the data in secondary * storage. */ if (!page_exists) { /* * If we don't allocate a (blank) page * here... another thread could try * to page it in, allocate a page, and * then block on the busy page in its * shadow (first_object). Then we'd * trip over the busy page after we * found that the copy_object's pager * doesn't have the page... */ copy_m = vm_page_alloc(copy_object, copy_offset); if (copy_m == NULL) { /* * Wait for a page, then retry. */ RELEASE_PAGE(m); copy_object->ref_count--; vm_object_unlock(copy_object); UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } if (copy_object->pager != NULL) { vm_object_unlock(object); vm_object_unlock(copy_object); UNLOCK_MAP; page_exists = vm_pager_has_page( copy_object->pager, (copy_offset + copy_object->paging_offset)); vm_object_lock(copy_object); /* * Since the map is unlocked, someone * else could have copied this object * and put a different copy_object * between the two. Or, the last * reference to the copy-object (other * than the one we have) may have * disappeared - if that has happened, * we don't need to make the copy. */ if (copy_object->shadow != object || copy_object->ref_count == 1) { /* * Gaah... start over! 
*/ FREE_PAGE(copy_m); vm_object_unlock(copy_object); vm_object_deallocate(copy_object); /* may block */ vm_object_lock(object); goto RetryCopy; } vm_object_lock(object); if (page_exists) { /* * We didn't need the page */ FREE_PAGE(copy_m); } } } if (!page_exists) { /* * Must copy page into copy-object. */ vm_page_copy(m, copy_m); copy_m->flags &= ~PG_FAKE; /* * Things to remember: * 1. The copied page must be marked 'dirty' * so it will be paged out to the copy * object. * 2. If the old page was in use by any users * of the copy-object, it must be removed * from all pmaps. (We can't know which * pmaps use it.) */ vm_page_lock_queues(); vm_page_activate(old_m); pmap_page_protect(VM_PAGE_TO_PHYS(old_m), VM_PROT_NONE); if ((old_m->flags & PG_CLEAN) == 0) old_m->flags |= PG_LAUNDRY; copy_m->flags &= ~PG_CLEAN; vm_page_activate(copy_m); vm_page_unlock_queues(); PAGE_WAKEUP(copy_m); } /* * The reference count on copy_object must be * at least 2: one for our extra reference, * and at least one from the outside world * (we checked that when we last locked * copy_object). */ copy_object->ref_count--; vm_object_unlock(copy_object); m->flags &= ~PG_COPYONWRITE; } } if (m->flags & (PG_ACTIVE | PG_INACTIVE)) panic("vm_fault: active or inactive before retrying lookup"); /* * We must verify that the maps have not changed * since our last lookup. */ if (!lookup_still_valid) { vm_object_t retry_object; vm_offset_t retry_offset; vm_prot_t retry_prot; /* * Since map entries may be pageable, make sure we can * take a page fault on them. */ vm_object_unlock(object); /* * To avoid trying to write_lock the map while another * thread has it read_locked (in vm_map_pageable), we * do not try for write permission. If the page is * still writable, we will get write permission. If it * is not, or has been marked needs_copy, we enter the * mapping without write permission, and will merely * take another fault. 
*/ result = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE, &entry, &retry_object, &retry_offset, &retry_prot, &wired, &su); vm_object_lock(object); /* * If we don't need the page any longer, put it on the * active list (the easiest thing to do here). If no * one needs it, pageout will grab it eventually. */ if (result != KERN_SUCCESS) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; return(result); } lookup_still_valid = TRUE; if ((retry_object != first_object) || (retry_offset != first_offset)) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * Check whether the protection has changed or the object * has been copied while we left the map unlocked. * Changing from read to write permission is OK - we leave * the page write-protected, and catch the write fault. * Changing from write to read permission means that we * can't mark the page write-enabled after all. */ prot &= retry_prot; if (m->flags & PG_COPYONWRITE) prot &= ~VM_PROT_WRITE; } /* * (the various bits we're fiddling with here are locked by * the object's lock) */ /* XXX This distorts the meaning of the copy_on_write bit */ if (prot & VM_PROT_WRITE) m->flags &= ~PG_COPYONWRITE; /* * It's critically important that a wired-down page be faulted * only once in each map for which it is wired. */ if (m->flags & (PG_ACTIVE | PG_INACTIVE)) panic("vm_fault: active or inactive before pmap_enter"); vm_object_unlock(object); /* * Put this page into the physical map. * We had to do the unlock above because pmap_enter * may cause other faults. We don't put the * page back on the active queue until later so * that the page-out daemon won't find us (yet). */ pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); /* * If the page is not wired down, then put it where the * pageout daemon can find it. 
*/ vm_object_lock(object); vm_page_lock_queues(); if (change_wiring) { if (wired) vm_page_wire(m); else vm_page_unwire(m); } else { vm_page_activate(m); } if( curproc && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } vm_page_unlock_queues(); /* * Unlock everything, and return */ PAGE_WAKEUP(m); UNLOCK_AND_DEALLOCATE; return(KERN_SUCCESS); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the * range of addresses may not fault, so that * page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it * in the physical map. */ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, VM_PROT_NONE, TRUE); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return(rv); } } return(KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. */ void vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va, pa; register pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to * get their mappings from the physical map system. */ vm_page_lock_queues(); for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa == (vm_offset_t) 0) { panic("unwire: page not in pmap"); } pmap_change_wiring(pmap, va, FALSE); vm_page_unwire(PHYS_TO_VM_PAGE(pa)); } vm_page_unlock_queues(); /* * Inform the physical mapping system that the range * of addresses may fault, so that page tables and * such may be unwired themselves. 
*/ pmap_pageable(pmap, start, end, TRUE); } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t dst_object; vm_object_t src_object; vm_offset_t dst_offset; vm_offset_t src_offset; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif lint src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. * (Doesn't actually shadow anything - we copy the pages * directly.) */ dst_object = vm_object_allocate( (vm_size_t) (dst_entry->end - dst_entry->start)); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying * each one from the source object (it should be there) to the * destination object. */ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ vm_object_lock(dst_object); do { dst_m = vm_page_alloc(dst_object, dst_offset); if (dst_m == NULL) { vm_object_unlock(dst_object); VM_WAIT; vm_object_lock(dst_object); } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be * in memory.) */ vm_object_lock(src_object); src_m = vm_page_lookup(src_object, dst_offset + src_offset); if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); /* * Enter it in the pmap... 
*/ vm_object_unlock(src_object); vm_object_unlock(dst_object); pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), prot, FALSE); /* * Mark it no longer busy, and put it on the active list. */ vm_object_lock(dst_object); vm_page_lock_queues(); vm_page_activate(dst_m); vm_page_unlock_queues(); PAGE_WAKEUP(dst_m); vm_object_unlock(dst_object); } } /* * looks page up in shadow chain */ int vm_fault_page_lookup(object, offset, rtobject, rtoffset, rtm) vm_object_t object; vm_offset_t offset; vm_object_t *rtobject; vm_offset_t *rtoffset; vm_page_t *rtm; { vm_page_t m; vm_object_t first_object = object; *rtm = 0; *rtobject = 0; *rtoffset = 0; while (!(m=vm_page_lookup(object, offset))) { if (object->pager) { if (vm_pager_has_page(object->pager, object->paging_offset+offset)) { *rtobject = object; *rtoffset = offset; return 1; } } if (!object->shadow) return 0; else { offset += object->shadow_offset; object = object->shadow; } } *rtobject = object; *rtoffset = offset; *rtm = m; return 1; } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. 
* * Inputs: * first_object, first_offset, m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ int vm_fault_additional_pages(first_object, first_offset, m, rbehind, raheada, marray, reqpage) vm_object_t first_object; vm_offset_t first_offset; vm_page_t m; int rbehind; int raheada; vm_page_t *marray; int *reqpage; { int i; vm_page_t tmpm; vm_object_t object; vm_offset_t offset, startoffset, endoffset, toffset, size; vm_object_t rtobject; vm_page_t rtm; vm_offset_t rtoffset; vm_offset_t offsetdiff; int rahead; int treqpage; object = m->object; offset = m->offset; offsetdiff = offset - first_offset; /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object->pager, object->paging_offset+offset)) return 0; /* * if there is no getmulti routine for this pager, then just allow * one page to be read. */ /* if (!object->pager->pg_ops->pgo_getpages) { *reqpage = 0; marray[0] = m; return 1; } */ /* * try to do any readahead that we might have free pages for. */ rahead = raheada; if (rahead > (cnt.v_free_count - cnt.v_free_reserved)) { rahead = cnt.v_free_count - cnt.v_free_reserved; rbehind = 0; } if (cnt.v_free_count < cnt.v_free_min) { if (rahead > VM_FAULT_READ_AHEAD_MIN) rahead = VM_FAULT_READ_AHEAD_MIN; rbehind = 0; } /* * if we don't have any free pages, then just read one page. 
*/ if (rahead <= 0) { *reqpage = 0; marray[0] = m; return 1; } /* * scan backward for the read behind pages -- * in memory or on disk not in same object */ toffset = offset - NBPG; if( rbehind*NBPG > offset) rbehind = offset / NBPG; startoffset = offset - rbehind*NBPG; while (toffset >= startoffset) { if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || rtm != 0 || rtobject != object) { startoffset = toffset + NBPG; break; } if( toffset == 0) break; toffset -= NBPG; } /* * scan forward for the read ahead pages -- * in memory or on disk not in same object */ toffset = offset + NBPG; endoffset = offset + (rahead+1)*NBPG; while (toffset < object->size && toffset < endoffset) { if (!vm_fault_page_lookup(first_object, toffset - offsetdiff, &rtobject, &rtoffset, &rtm) || rtm != 0 || rtobject != object) { break; } toffset += NBPG; } endoffset = toffset; /* calculate number of bytes of pages */ size = (endoffset - startoffset) / NBPG; /* calculate the page offset of the required page */ treqpage = (offset - startoffset) / NBPG; /* see if we have space (again) */ if (cnt.v_free_count >= cnt.v_free_reserved + size) { bzero(marray, (rahead + rbehind + 1) * sizeof(vm_page_t)); /* * get our pages and don't block for them */ for (i = 0; i < size; i++) { if (i != treqpage) rtm = vm_page_alloc(object, startoffset + i * NBPG); else rtm = m; marray[i] = rtm; } for (i = 0; i < size; i++) { if (marray[i] == 0) break; } /* * if we could not get our block of pages, then * free the readahead/readbehind pages. 
*/ if (i < size) { for (i = 0; i < size; i++) { if (i != treqpage && marray[i]) FREE_PAGE(marray[i]); } *reqpage = 0; marray[0] = m; return 1; } *reqpage = treqpage; return size; } *reqpage = 0; marray[0] = m; return 1; } Index: head/sys/vm/vm_kern.c =================================================================== --- head/sys/vm/vm_kern.c (revision 1886) +++ head/sys/vm/vm_kern.c (revision 1887) @@ -1,456 +1,460 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id$ + * $Id: vm_kern.c,v 1.3 1994/08/02 07:55:22 davidg Exp $ */ /* * Kernel memory management. */ #include #include #include #include #include #include /* * kmem_alloc_pageable: * * Allocate pageable memory to the kernel's address map. * map must be "kernel_map" below. 
*/ vm_offset_t kmem_alloc_pageable(map, size) vm_map_t map; register vm_size_t size; { vm_offset_t addr; register int result; #if 0 if (map != kernel_map) panic("kmem_alloc_pageable: not called with kernel_map"); #endif size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, (vm_offset_t) 0, &addr, size, TRUE); if (result != KERN_SUCCESS) { return(0); } return(addr); } /* * Allocate wired-down memory in the kernel's address map * or a submap. */ vm_offset_t kmem_alloc(map, size) register vm_map_t map; register vm_size_t size; { vm_offset_t addr; register vm_offset_t offset; extern vm_object_t kernel_object; vm_offset_t i; size = round_page(size); /* * Use the kernel object for wired-down kernel pages. * Assume that no region of the kernel object is * referenced more than once. */ /* * Locate sufficient space in the map. This will give us the * final virtual address for the new memory, and thus will tell * us the offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); return (0); } offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, offset, addr, addr + size); vm_map_unlock(map); /* * Guarantee that there are pages already in this object * before calling vm_map_pageable. This is to prevent the * following scenario: * * 1) Threads have swapped out, so that there is a * pager for the kernel_object. * 2) The kmsg zone is empty, and so we are kmem_allocing * a new page for it. * 3) vm_map_pageable calls vm_fault; there is no page, * but there is a pager, so we call * pager_data_request. But the kmsg zone is empty, * so we must kmem_alloc. * 4) goto 1 * 5) Even if the kmsg zone is not empty: when we get * the data back from the pager, it will be (very * stale) non-zero data. kmem_alloc is defined to * return zero-filled memory. * * We're intentionally not activating the pages we allocate * to prevent a race with page-out. 
vm_map_pageable will wire * the pages. */ vm_object_lock(kernel_object); for (i = 0 ; i < size; i+= PAGE_SIZE) { vm_page_t mem; while ((mem = vm_page_alloc(kernel_object, offset+i)) == NULL) { vm_object_unlock(kernel_object); VM_WAIT; vm_object_lock(kernel_object); } vm_page_zero_fill(mem); mem->flags &= ~PG_BUSY; } vm_object_unlock(kernel_object); /* * And finally, mark the data as non-pageable. */ (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); /* * Try to coalesce the map */ vm_map_simplify(map, addr); return(addr); } /* * kmem_free: * * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. */ void kmem_free(map, addr, size) vm_map_t map; register vm_offset_t addr; vm_size_t size; { (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. * * Arguments are as follows: * * parent Map to take range from * size Size of range to find * min, max Returned endpoints of map * pageable Can the region be paged */ vm_map_t kmem_suballoc(parent, min, max, size, pageable) register vm_map_t parent; vm_offset_t *min, *max; register vm_size_t size; boolean_t pageable; { register int ret; vm_map_t result; size = round_page(size); *min = (vm_offset_t) vm_map_min(parent); ret = vm_map_find(parent, NULL, (vm_offset_t) 0, min, size, TRUE); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); } *max = *min + size; pmap_reference(vm_map_pmap(parent)); result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return(result); } /* * Allocate wired-down memory in the kernel's address map for the higher * level kernel 
memory allocator (kern/kern_malloc.c). We cannot use * kmem_alloc() because we may need to allocate memory at interrupt * level where we cannot block (canwait == FALSE). * * This routine has its own private kernel submap (kmem_map) and object * (kmem_object). This, combined with the fact that only malloc uses * this routine, ensures that we will never block in map or object waits. * * Note that this still only works in a uni-processor environment and * when called at splhigh(). * * We don't worry about expanding the map (adding entries) since entries * for wired maps are statically allocated. */ vm_offset_t kmem_malloc(map, size, canwait) register vm_map_t map; register vm_size_t size; boolean_t canwait; { register vm_offset_t offset, i; vm_map_entry_t entry; vm_offset_t addr; vm_page_t m; extern vm_object_t kmem_object; if (map != kmem_map && map != mb_map) panic("kern_malloc_alloc: map != {kmem,mb}_map"); size = round_page(size); addr = vm_map_min(map); /* * Locate sufficient space in the map. This will give us the * final virtual address for the new memory, and thus will tell * us the offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); #if 0 if (canwait) /* XXX should wait */ panic("kmem_malloc: %s too small", map == kmem_map ? "kmem_map" : "mb_map"); #endif if (canwait) panic("kmem_malloc: map too small"); return (0); } offset = addr - vm_map_min(kmem_map); vm_object_reference(kmem_object); vm_map_insert(map, kmem_object, offset, addr, addr + size); /* * If we can wait, just mark the range as wired * (will fault pages as necessary). */ if (canwait) { vm_map_unlock(map); (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); vm_map_simplify(map, addr); return(addr); } /* * If we cannot wait then we must allocate all memory up front, * pulling it off the active queue to prevent pageout. 
*/ vm_object_lock(kmem_object); for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_alloc(kmem_object, offset + i); /* * Ran out of space, free everything up and return. * Don't need to lock page queues here as we know * that the pages we got aren't on any queues. */ if (m == NULL) { while (i != 0) { i -= PAGE_SIZE; m = vm_page_lookup(kmem_object, offset + i); vm_page_free(m); } vm_object_unlock(kmem_object); vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return(0); } #if 0 vm_page_zero_fill(m); #endif m->flags &= ~PG_BUSY; } vm_object_unlock(kmem_object); /* * Mark map entry as non-pageable. * Assert: vm_map_insert() will never be able to extend the previous * entry so there will be a new entry exactly corresponding to this * address range and it will have wired_count == 0. */ if (!vm_map_lookup_entry(map, addr, &entry) || entry->start != addr || entry->end != addr + size || entry->wired_count) panic("kmem_malloc: entry not found or misaligned"); entry->wired_count++; /* * Loop thru pages, entering them in the pmap. * (We cannot add them to the wired count without * wrapping the vm_page_queue_lock in splimp...) */ for (i = 0; i < size; i += PAGE_SIZE) { vm_object_lock(kmem_object); m = vm_page_lookup(kmem_object, offset + i); vm_object_unlock(kmem_object); +/* pmap_enter(map->pmap, addr + i, VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, TRUE); +*/ + pmap_kenter( addr + i, VM_PAGE_TO_PHYS(m)); } + pmap_update(); vm_map_unlock(map); vm_map_simplify(map, addr); return(addr); } /* * kmem_alloc_wait * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * */ vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; size = round_page(size); for (;;) { /* * To make this work for more than one map, * use the map's lock to lock out sleepers/wakers. 
*/ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); return (0); } assert_wait((int)map, TRUE); vm_map_unlock(map); thread_block("kmaw"); } vm_map_insert(map, NULL, (vm_offset_t)0, addr, addr + size); vm_map_unlock(map); return (addr); } /* * kmem_free_wakeup * * Returns memory to a submap of the kernel, and wakes up any threads * waiting for memory in that map. */ void kmem_free_wakeup(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); thread_wakeup((int)map); vm_map_unlock(map); } /* * Create the kernel map; insert a mapping covering kernel text, data, bss, * and all space allocated thus far (`boostrap' data). The new map will thus * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and * the range between `start' and `end' as free. */ void kmem_init(start, end) vm_offset_t start, end; { register vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE); vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_offset_t)0, VM_MIN_KERNEL_ADDRESS, start); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 1886) +++ head/sys/vm/vm_pageout.c (revision 1887) @@ -1,791 +1,792 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.5 1994/08/02 07:55:33 davidg Exp $ + * $Id: vm_pageout.c,v 1.6 1994/08/04 03:06:47 davidg Exp $ */ /* * The proverbial page-out daemon. 
*/ #include #include #include #include #include #include #include #include extern vm_map_t kmem_map; int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pagescanner; /* Event on which pagescanner sleeps */ int vm_pageout_free_min = 0; /* Stop pageout to wait for pagers at this free level */ int vm_pageout_pages_needed = 0; /* flag saying that the pageout daemon needs pages */ int vm_page_pagesfreed; extern int npendingio; extern int hz; int vm_pageout_proc_limit; extern int nswiodone; extern int swap_pager_full; extern int swap_pager_ready(); #define MAXREF 32767 #define MAXSCAN 512 /* maximum number of pages to scan in active queue */ /* set the "clock" hands to be (MAXSCAN * 4096) Bytes */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_MAX 300 #define LOWATER ((2048*1024)/NBPG) #define VM_PAGEOUT_PAGE_COUNT 8 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; static vm_offset_t vm_space_needed; int vm_pageout_req_do_stats; int vm_page_max_wired = 0; /* XXX max # of wired pages system-wide */ /* * vm_pageout_clean: * cleans a vm_page */ int vm_pageout_clean(m, sync) register vm_page_t m; int sync; { /* * Clean the page and remove it from the * laundry. * * We set the busy bit to cause * potential page faults on this page to * block. * * And we set pageout-in-progress to keep * the object from disappearing during * pageout. This guarantees that the * page won't move from the inactive * queue. (However, any other page on * the inactive queue may move!) */ register vm_object_t object; register vm_pager_t pager; int pageout_status[VM_PAGEOUT_PAGE_COUNT]; vm_page_t ms[VM_PAGEOUT_PAGE_COUNT]; int pageout_count; int anyok=0; int i; vm_offset_t offset = m->offset; object = m->object; if (!object) { printf("pager: object missing\n"); return 0; } /* * Try to collapse the object before * making a pager for it. We must * unlock the page queues first. * We try to defer the creation of a pager * until all shadows are not paging. 
This * allows vm_object_collapse to work better and * helps control swap space size. * (J. Dyson 11 Nov 93) */ if (!object->pager && cnt.v_free_count < vm_pageout_free_min) return 0; if (!object->pager && object->shadow && object->shadow->paging_in_progress) return 0; if( !sync) { if (object->shadow) { vm_object_collapse(object); if (!vm_page_lookup(object, offset)) return 0; } if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { return 0; } } pageout_count = 1; ms[0] = m; if( pager = object->pager) { for(i=1;iflags & (PG_CLEAN|PG_INACTIVE|PG_BUSY)) == PG_INACTIVE) || (( ms[i]->flags & PG_CLEAN) == 0 && sync == VM_PAGEOUT_FORCE)) && (ms[i]->wire_count == 0) && (ms[i]->hold_count == 0)) pageout_count++; else break; } else break; } for(i=0;iflags |= PG_BUSY; pmap_page_protect(VM_PAGE_TO_PHYS(ms[i]), VM_PROT_READ); } object->paging_in_progress += pageout_count; cnt.v_pageouts += pageout_count; } else { m->flags |= PG_BUSY; pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ); cnt.v_pageouts++; object->paging_in_progress++; pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, object->size, VM_PROT_ALL, 0); if (pager != NULL) { vm_object_setpager(object, pager, 0, FALSE); } } /* * If there is no pager for the page, * use the default pager. If there's * no place to put the page at the * moment, leave it in the laundry and * hope that there will be paging space * later. */ if ((pager && pager->pg_type == PG_SWAP) || cnt.v_free_count >= vm_pageout_free_min) { if( pageout_count == 1) { pageout_status[0] = pager ? vm_pager_put(pager, m, ((sync || (object == kernel_object)) ? TRUE: FALSE)) : VM_PAGER_FAIL; } else { if( !pager) { for(i=0;iflags &= ~PG_LAUNDRY; ++anyok; break; case VM_PAGER_PEND: ms[i]->flags &= ~PG_LAUNDRY; ++anyok; break; case VM_PAGER_BAD: /* * Page outside of range of object. * Right now we essentially lose the * changes by pretending it worked. 
*/ ms[i]->flags &= ~PG_LAUNDRY; ms[i]->flags |= PG_CLEAN; pmap_clear_modify(VM_PAGE_TO_PHYS(ms[i])); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then * reactivate the page so it doesn't * clog the inactive list. (We will * try paging out it again later). */ if (ms[i]->flags & PG_INACTIVE) vm_page_activate(ms[i]); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave * the page busy to block all other accesses. * Also, leave the paging in progress * indicator set so that we don't attempt an * object collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { PAGE_WAKEUP(ms[i]); if (--object->paging_in_progress == 0) wakeup((caddr_t) object); if (pmap_is_referenced(VM_PAGE_TO_PHYS(ms[i]))) { pmap_clear_reference(VM_PAGE_TO_PHYS(ms[i])); if( ms[i]->flags & PG_INACTIVE) vm_page_activate(ms[i]); } } } return anyok; } /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * shadows. * * The object and map must be locked. */ int vm_pageout_object_deactivate_pages(map, object, count) vm_map_t map; vm_object_t object; int count; { register vm_page_t p, next; int rcount; int s; int dcount; dcount = 0; if (count == 0) count = 1; if (object->shadow) { int scount = count; if( object->shadow->ref_count > 1) scount /= object->shadow->ref_count; if( scount) dcount += vm_pageout_object_deactivate_pages(map, object->shadow, scount); } if (object->paging_in_progress) return dcount; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = object->memq.tqh_first; while (p && (rcount-- > 0)) { next = p->listq.tqe_next; vm_page_lock_queues(); /* * if a page is active, not wired and is in the processes pmap, * then deactivate the page. 
*/ if ((p->flags & (PG_ACTIVE|PG_BUSY)) == PG_ACTIVE && p->wire_count == 0 && p->hold_count == 0 && pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p))) { p->act_count -= min(p->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!p->act_count) { vm_page_deactivate(p); pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); /* * else if on the next go-around we will deactivate the page * we need to place the page on the end of the queue to age * the other pages in memory. */ } else { TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); TAILQ_REMOVE(&object->memq, p, listq); TAILQ_INSERT_TAIL(&object->memq, p, listq); } /* * see if we are done yet */ if (p->flags & PG_INACTIVE) { --count; ++dcount; if (count <= 0 && cnt.v_inactive_count > cnt.v_inactive_target) { vm_page_unlock_queues(); return dcount; } } } else { /* * Move the page to the bottom of the queue. */ pmap_clear_reference(VM_PAGE_TO_PHYS(p)); if (p->act_count < ACT_MAX) p->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); TAILQ_REMOVE(&object->memq, p, listq); TAILQ_INSERT_TAIL(&object->memq, p, listq); } } vm_page_unlock_queues(); p = next; } return dcount; } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. 
*/
void
vm_pageout_map_deactivate_pages(map, entry, count, freeer)
	vm_map_t map;
	vm_map_entry_t entry;
	int *count;
	int (*freeer)(vm_map_t, vm_object_t, int);
{
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	vm_object_t obj;

	/* Deactivation quota already satisfied -- nothing to do. */
	if (*count <= 0)
		return;
	vm_map_reference(map);
	/*
	 * Take only a read lock, and without blocking: if the map is
	 * write-locked by someone else, skip it rather than stall the
	 * pageout daemon.
	 */
	if (!lock_try_read(&map->lock)) {
		vm_map_deallocate(map);
		return;
	}
	if (entry == 0) {
		/* No specific entry given: recurse over every entry in the map. */
		tmpe = map->header.next;
		while (tmpe != &map->header && *count > 0) {
			vm_pageout_map_deactivate_pages(map, tmpe, count, freeer);
			tmpe = tmpe->next;
		};
	} else if (entry->is_sub_map || entry->is_a_map) {
		/* Share map or submap: recurse over the backing map's entries. */
		tmpm = entry->object.share_map;
		tmpe = tmpm->header.next;
		while (tmpe != &tmpm->header && *count > 0) {
			vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer);
			tmpe = tmpe->next;
		};
	} else if (obj = entry->object.vm_object) {
		/*
		 * Leaf entry with a VM object: let the supplied callback
		 * deactivate pages and charge them against the quota.
		 * (Assignment-in-condition is intentional historic style.)
		 */
		*count -= (*freeer)(map, obj, *count);
	}
	lock_read_done(&map->lock);
	vm_map_deallocate(map);
	return;
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
int
vm_pageout_scan()
{
	vm_page_t m;
	int page_shortage, maxscan, maxlaunder;
	int pages_freed, free, nproc;
	int desired_free;
	vm_page_t next;
	struct proc *p;
	vm_object_t object;
	int s;
	int force_wakeup = 0;

morefree:
	/*
	 * scan the processes for exceeding their rlimits or if process
	 * is swapped out -- deactivate pages
	 */
rescanproc1:
	for (p = (struct proc *)allproc; p != NULL; p = p->p_next) {
		vm_offset_t size;
		int overage;
		vm_offset_t limit;

		/*
		 * if this is a system process or if we have already
		 * looked at this process, skip it.
		 */
		if (p->p_flag & (P_SYSTEM|P_WEXIT)) {
			continue;
		}

		/*
		 * if the process is in a non-running type state,
		 * don't touch it.
		 */
		if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
			continue;
		}

		/*
		 * get a limit
		 */
		limit = min(p->p_rlimit[RLIMIT_RSS].rlim_cur,
			    p->p_rlimit[RLIMIT_RSS].rlim_max);

		/*
		 * let processes that are swapped out really be swapped out
		 * set the limit to nothing (will force a swap-out.)
*/ if ((p->p_flag & P_INMEM) == 0) limit = 0; size = p->p_vmspace->vm_pmap.pm_stats.resident_count * NBPG; if (size >= limit) { overage = (size - limit) / NBPG; vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); } } if (((cnt.v_free_count + cnt.v_inactive_count) >= (cnt.v_inactive_target + cnt.v_free_target)) && (cnt.v_free_count >= cnt.v_free_target)) return force_wakeup; pages_freed = 0; desired_free = cnt.v_free_target; /* * Start scanning the inactive queue for pages we can free. * We keep scanning until we have enough free pages or * we have scanned through the entire queue. If we * encounter dirty pages, we start cleaning them. */ maxlaunder = (cnt.v_free_target - cnt.v_free_count); maxscan = cnt.v_inactive_count; rescan1: m = vm_page_queue_inactive.tqh_first; while (m && (maxscan-- > 0) && (cnt.v_free_count < desired_free) ) { vm_page_t next; next = m->pageq.tqe_next; if( (m->flags & PG_INACTIVE) == 0) { printf("vm_pageout_scan: page not inactive?"); continue; } /* * activate held pages */ if (m->hold_count != 0) { vm_page_activate(m); m = next; continue; } /* * dont mess with busy pages */ if (m->flags & PG_BUSY) { m = next; continue; } /* * if page is clean and but the page has been referenced, * then reactivate the page, but if we are very low on memory * or the page has not been referenced, then we free it to the * vm system. 
*/ if (m->flags & PG_CLEAN) { if ((cnt.v_free_count > vm_pageout_free_min) /* XXX */ && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { vm_page_activate(m); } else if (!m->act_count) { pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); ++pages_freed; } else { m->act_count -= min(m->act_count, ACT_DECLINE); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); } } else if ((m->flags & PG_LAUNDRY) && maxlaunder > 0) { int written; if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); m = next; continue; } /* * If a page is dirty, then it is either * being washed (but not yet cleaned) * or it is still in the laundry. If it is * still in the laundry, then we start the * cleaning operation. */ if (written = vm_pageout_clean(m,0)) { maxlaunder -= written; } + if (!next) + break; /* * if the next page has been re-activated, start scanning again */ - if (!next || (next->flags & PG_INACTIVE) == 0) + if ((next->flags & PG_INACTIVE) == 0) goto rescan1; } else if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); } m = next; } /* * now check malloc area or swap processes out if we are in low * memory conditions */ if (cnt.v_free_count <= cnt.v_free_min) { /* * swap out inactive processes */ swapout_threads(); } /* * Compute the page shortage. If we are still very low on memory * be sure that we will move a minimal amount of pages from active * to inactive. 
*/ page_shortage = cnt.v_inactive_target - (cnt.v_free_count + cnt.v_inactive_count); if (page_shortage <= 0) { if (pages_freed == 0) { if( cnt.v_free_count < cnt.v_free_min) { page_shortage = cnt.v_free_min - cnt.v_free_count; } else if(((cnt.v_free_count + cnt.v_inactive_count) < (cnt.v_free_min + cnt.v_inactive_target))) { page_shortage = 1; } else { page_shortage = 0; } } } maxscan = cnt.v_active_count; m = vm_page_queue_active.tqh_first; while (m && maxscan-- && (page_shortage > 0)) { next = m->pageq.tqe_next; /* * Don't deactivate pages that are busy. */ if ((m->flags & PG_BUSY) || (m->hold_count != 0)) { m = next; continue; } if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); if (m->act_count < ACT_MAX) m->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); TAILQ_REMOVE(&m->object->memq, m, listq); TAILQ_INSERT_TAIL(&m->object->memq, m, listq); } else { m->act_count -= min(m->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!m->act_count) { vm_page_deactivate(m); --page_shortage; /* * else if on the next go-around we will deactivate the page * we need to place the page on the end of the queue to age * the other pages in memory. */ } else { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); TAILQ_REMOVE(&m->object->memq, m, listq); TAILQ_INSERT_TAIL(&m->object->memq, m, listq); } } - m = next; } /* * if we have not freed any pages and we are desparate for memory * then we keep trying until we get some (any) memory. */ if( !force_wakeup && (swap_pager_full || !force_wakeup || (pages_freed == 0 && (cnt.v_free_count < cnt.v_free_min)))){ vm_pager_sync(); force_wakeup = 1; goto morefree; } vm_page_pagesfreed += pages_freed; return force_wakeup; } /* * vm_pageout is the high level pageout daemon. 
*/ void vm_pageout() { extern npendingio, swiopend; static nowakeup; (void) spl0(); /* * Initialize some paging parameters. */ vmretry: cnt.v_free_min = 12; cnt.v_free_reserved = 8; if (cnt.v_free_min < 8) cnt.v_free_min = 8; if (cnt.v_free_min > 32) cnt.v_free_min = 32; vm_pageout_free_min = 4; cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; cnt.v_inactive_target = cnt.v_free_count / 12; cnt.v_free_min += cnt.v_free_reserved; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; (void) swap_pager_alloc(0, 0, 0, 0); /* * The pageout daemon is never done, so loop * forever. */ while (TRUE) { int force_wakeup; extern struct loadavg averunnable; /* cnt.v_free_min = 12 + averunnable.ldavg[0] / 1024; cnt.v_free_target = 2*cnt.v_free_min + cnt.v_free_reserved; cnt.v_inactive_target = cnt.v_free_target*2; */ tsleep((caddr_t) &vm_pages_needed, PVM, "psleep", 0); vm_pager_sync(); /* * The force wakeup hack added to eliminate delays and potiential * deadlock. It was possible for the page daemon to indefintely * postpone waking up a process that it might be waiting for memory * on. The putmulti stuff seems to have aggravated the situation. */ force_wakeup = vm_pageout_scan(); vm_pager_sync(); if( force_wakeup) wakeup( (caddr_t) &cnt.v_free_count); cnt.v_scan++; wakeup((caddr_t) kmem_map); } } Index: head/sys/vm/vm_pager.c =================================================================== --- head/sys/vm/vm_pager.c (revision 1886) +++ head/sys/vm/vm_pager.c (revision 1887) @@ -1,324 +1,426 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id$ + * $Id: vm_pager.c,v 1.3 1994/08/02 07:55:35 davidg Exp $ */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. */ #include #include #include +#include +#include #include #include #include extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; struct pagerops *pagertab[] = { &swappagerops, /* PG_SWAP */ &vnodepagerops, /* PG_VNODE */ &devicepagerops, /* PG_DEV */ }; int npagers = sizeof (pagertab) / sizeof (pagertab[0]); struct pagerops *dfltpagerops = NULL; /* default pager */ /* * Kernel address space for mapping pages. * Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. 
*/ -#define PAGER_MAP_SIZE (4 * 1024 * 1024) +#define PAGER_MAP_SIZE (8 * 1024 * 1024) int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; boolean_t pager_map_wanted; vm_offset_t pager_sva, pager_eva; +int bswneeded; +vm_offset_t swapbkva; /* swap buffers kva */ void vm_pager_init() { struct pagerops **pgops; /* - * Allocate a kernel submap for tracking get/put page mappings - */ -/* - pager_map = kmem_suballoc(kernel_map, &pager_sva, &pager_eva, - PAGER_MAP_SIZE, FALSE); -*/ - /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops) (*(*pgops)->pgo_init)(); if (dfltpagerops == NULL) panic("no default pager"); } +void +vm_pager_bufferinit() +{ + struct buf *bp; + int i; + bp = swbuf; + /* + * Now set up swap and physical I/O buffer headers. + */ + for (i = 0; i < nswbuf - 1; i++, bp++) { + TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); + bp->b_rcred = bp->b_wcred = NOCRED; + bp->b_vnbufs.le_next = NOLIST; + } + bp->b_rcred = bp->b_wcred = NOCRED; + bp->b_vnbufs.le_next = NOLIST; + bp->b_actf = NULL; + + swapbkva = kmem_alloc_pageable( pager_map, nswbuf * MAXPHYS); + if( !swapbkva) + panic("Not enough pager_map VM space for physical buffers"); +} + /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_pager_t vm_pager_allocate(type, handle, size, prot, off) int type; caddr_t handle; vm_size_t size; vm_prot_t prot; vm_offset_t off; { struct pagerops *ops; ops = (type == PG_DFLT) ? 
dfltpagerops : pagertab[type]; if (ops) return ((*ops->pgo_alloc)(handle, size, prot, off)); return (NULL); } void vm_pager_deallocate(pager) vm_pager_t pager; { if (pager == NULL) panic("vm_pager_deallocate: null pager"); (*pager->pg_ops->pgo_dealloc)(pager); } int vm_pager_get_pages(pager, m, count, reqpage, sync) vm_pager_t pager; vm_page_t *m; int count; int reqpage; boolean_t sync; { extern boolean_t vm_page_zero_fill(); extern int vm_pageout_count; int i; if (pager == NULL) { for (i=0;ipg_ops->pgo_getpages == 0) { for(i=0;ipg_ops->pgo_putpages) return(VM_PAGER_PUT_MULTI(pager, m, count, sync, rtvals)); else { for(i=0;ipg_ops->pgo_haspage)(pager, offset)); } /* * Called by pageout daemon before going back to sleep. * Gives pagers a chance to clean up any completed async pageing operations. */ void vm_pager_sync() { struct pagerops **pgops; for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops) (*(*pgops)->pgo_putpage)(NULL, NULL, 0); } #if 0 void vm_pager_cluster(pager, offset, loff, hoff) vm_pager_t pager; vm_offset_t offset; vm_offset_t *loff; vm_offset_t *hoff; { if (pager == NULL) panic("vm_pager_cluster: null pager"); return ((*pager->pg_ops->pgo_cluster)(pager, offset, loff, hoff)); } #endif vm_offset_t vm_pager_map_page(m) vm_page_t m; { vm_offset_t kva; kva = kmem_alloc_wait(pager_map, PAGE_SIZE); pmap_enter(vm_map_pmap(pager_map), kva, VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, TRUE); return(kva); } void vm_pager_unmap_page(kva) vm_offset_t kva; { kmem_free_wakeup(pager_map, kva, PAGE_SIZE); } vm_page_t vm_pager_atop(kva) vm_offset_t kva; { vm_offset_t pa; pa = pmap_extract(vm_map_pmap(pager_map), kva); if (pa == 0) panic("vm_pager_atop"); return (PHYS_TO_VM_PAGE(pa)); } vm_pager_t vm_pager_lookup(pglist, handle) register struct pagerlst *pglist; caddr_t handle; { register vm_pager_t pager; for (pager = pglist->tqh_first; pager; pager = pager->pg_list.tqe_next) if (pager->pg_handle == handle) return (pager); return (NULL); } /* * This 
routine gains a reference to the object. * Explicit deallocation is necessary. */ int pager_cache(object, should_cache) vm_object_t object; boolean_t should_cache; { if (object == NULL) return (KERN_INVALID_ARGUMENT); vm_object_cache_lock(); vm_object_lock(object); if (should_cache) object->flags |= OBJ_CANPERSIST; else object->flags &= ~OBJ_CANPERSIST; vm_object_unlock(object); vm_object_cache_unlock(); vm_object_deallocate(object); return (KERN_SUCCESS); } + +/* + * allocate a physical buffer + */ +struct buf * +getpbuf() { + int s; + struct buf *bp; + + s = splbio(); + /* get a bp from the swap buffer header pool */ +tryagain: + while ((bp = bswlist.tqh_first) == NULL) { + bswneeded = 1; + tsleep((caddr_t)&bswneeded, PVM, "wswbuf", 0); + } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + splx(s); + + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_data = (caddr_t) (MAXPHYS * (bp-swbuf)) + swapbkva; + return bp; +} + +/* + * allocate a physical buffer, if one is available + */ +struct buf * +trypbuf() { + int s; + struct buf *bp; + + s = splbio(); + if ((bp = bswlist.tqh_first) == NULL) { + splx(s); + return NULL; + } + TAILQ_REMOVE(&bswlist, bp, b_freelist); + splx(s); + + bzero(bp, sizeof *bp); + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_data = (caddr_t) (MAXPHYS * (bp-swbuf)) + swapbkva; + return bp; +} + +/* + * release a physical buffer + */ +void +relpbuf(bp) + struct buf *bp; +{ + int s; + + s = splbio(); + + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + + if (bp->b_vp) + brelvp(bp); + + TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); + + if (bswneeded) { + bswneeded = 0; + wakeup((caddr_t)&bswlist); + } + splx(s); +} + + Index: head/sys/vm/vm_swap.c =================================================================== --- head/sys/vm/vm_swap.c (revision 1886) +++ head/sys/vm/vm_swap.c (revision 1887) @@ 
-1,440 +1,430 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 - * $Id$ + * $Id: vm_swap.c,v 1.3 1994/08/02 07:55:40 davidg Exp $ */ #include #include #include #include #include #include #include /* XXX */ #include #include #include #include /* * Indirect driver for multi-controller paging. */ int nswap, nswdev; int vm_swap_size; #ifdef SEQSWAP int niswdev; /* number of interleaved swap devices */ int niswap; /* size of interleaved swap area */ #endif +int bswneeded; +vm_offset_t swapbkva; /* swap buffers kva */ /* * Set up swap devices. * Initialize linked list of free swap * headers. These do not actually point * to buffers, but rather to pages that * are being swapped in and out. */ void swapinit() { register int i; register struct buf *sp = swbuf; register struct proc *p = &proc0; /* XXX */ struct swdevt *swp; int error; /* * Count swap devices, and adjust total swap space available. * Some of the space will not be countable until later (dynamically * configurable devices) and some of the counted space will not be * available until a swapon() system call is issued, both usually * happen when the system goes multi-user. * * If using NFS for swap, swdevt[0] will already be bdevvp'd. 
XXX */ #ifdef SEQSWAP nswdev = niswdev = 0; nswap = niswap = 0; /* * All interleaved devices must come first */ for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { if (swp->sw_flags & SW_SEQUENTIAL) break; niswdev++; if (swp->sw_nblks > niswap) niswap = swp->sw_nblks; } niswap = roundup(niswap, dmmax); niswap *= niswdev; if (swdevt[0].sw_vp == NULL && bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) panic("swapvp"); /* * The remainder must be sequential */ for ( ; swp->sw_dev != NODEV; swp++) { if ((swp->sw_flags & SW_SEQUENTIAL) == 0) panic("binit: mis-ordered swap devices"); nswdev++; if (swp->sw_nblks > 0) { if (swp->sw_nblks % dmmax) swp->sw_nblks -= (swp->sw_nblks % dmmax); nswap += swp->sw_nblks; } } nswdev += niswdev; if (nswdev == 0) panic("swapinit"); nswap += niswap; #else nswdev = 0; nswap = 0; for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { nswdev++; if (swp->sw_nblks > nswap) nswap = swp->sw_nblks; } if (nswdev == 0) panic("swapinit"); if (nswdev > 1) nswap = ((nswap + dmmax - 1) / dmmax) * dmmax; nswap *= nswdev; if (swdevt[0].sw_vp == NULL && bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) panic("swapvp"); #endif if (nswap == 0) printf("WARNING: no swap space found\n"); else if (error = swfree(p, 0)) { printf("swfree errno %d\n", error); /* XXX */ panic("swapinit swfree 0"); } - - /* - * Now set up swap buffer headers. 
- */ - for (i = 0; i < nswbuf - 1; i++, sp++) { - TAILQ_INSERT_HEAD(&bswlist, sp, b_freelist); - sp->b_rcred = sp->b_wcred = p->p_ucred; - sp->b_vnbufs.le_next = NOLIST; - } - sp->b_rcred = sp->b_wcred = p->p_ucred; - sp->b_vnbufs.le_next = NOLIST; - sp->b_actf = NULL; } void swstrategy(bp) register struct buf *bp; { int sz, off, seg, index; register struct swdevt *sp; struct vnode *vp; #ifdef GENERIC /* * A mini-root gets copied into the front of the swap * and we run over top of the swap area just long * enough for us to do a mkfs and restor of the real * root (sure beats rewriting standalone restor). */ #define MINIROOTSIZE 4096 if (rootdev == dumpdev) bp->b_blkno += MINIROOTSIZE; #endif sz = howmany(bp->b_bcount, DEV_BSIZE); if (bp->b_blkno + sz > nswap) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } if (nswdev > 1) { #ifdef SEQSWAP if (bp->b_blkno < niswap) { if (niswdev > 1) { off = bp->b_blkno % dmmax; if (off+sz > dmmax) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } seg = bp->b_blkno / dmmax; index = seg % niswdev; seg /= niswdev; bp->b_blkno = seg*dmmax + off; } else index = 0; } else { register struct swdevt *swp; bp->b_blkno -= niswap; for (index = niswdev, swp = &swdevt[niswdev]; swp->sw_dev != NODEV; swp++, index++) { if (bp->b_blkno < swp->sw_nblks) break; bp->b_blkno -= swp->sw_nblks; } if (swp->sw_dev == NODEV || bp->b_blkno+sz > swp->sw_nblks) { bp->b_error = swp->sw_dev == NODEV ? 
ENODEV : EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } } #else off = bp->b_blkno % dmmax; if (off+sz > dmmax) { bp->b_error = EINVAL; bp->b_flags |= B_ERROR; biodone(bp); return; } seg = bp->b_blkno / dmmax; index = seg % nswdev; seg /= nswdev; bp->b_blkno = seg*dmmax + off; #endif } else index = 0; sp = &swdevt[index]; if ((bp->b_dev = sp->sw_dev) == NODEV) panic("swstrategy"); if (sp->sw_vp == NULL) { bp->b_error = ENODEV; bp->b_flags |= B_ERROR; biodone(bp); return; } VHOLD(sp->sw_vp); if ((bp->b_flags & B_READ) == 0) { if (vp = bp->b_vp) { vp->v_numoutput--; if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t)&vp->v_numoutput); } } sp->sw_vp->v_numoutput++; } if (bp->b_vp != NULL) brelvp(bp); bp->b_vp = sp->sw_vp; VOP_STRATEGY(bp); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ struct swapon_args { char *name; }; /* ARGSUSED */ int swapon(p, uap, retval) struct proc *p; struct swapon_args *uap; int *retval; { register struct vnode *vp; register struct swdevt *sp; dev_t dev; int error; struct nameidata nd; if (error = suser(p->p_ucred, &p->p_acflag)) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, p); if (error = namei(&nd)) return (error); vp = nd.ni_vp; if (vp->v_type != VBLK) { vrele(vp); return (ENOTBLK); } dev = (dev_t)vp->v_rdev; if (major(dev) >= nblkdev) { vrele(vp); return (ENXIO); } for (sp = &swdevt[0]; sp->sw_dev != NODEV; sp++) { if (sp->sw_dev == dev) { if (sp->sw_flags & SW_FREED) { vrele(vp); return (EBUSY); } sp->sw_vp = vp; if (error = swfree(p, sp - swdevt)) { vrele(vp); return (error); } return (0); } #ifdef SEQSWAP /* * If we have reached a non-freed sequential device without * finding what we are looking for, it is an error. * That is because all interleaved devices must come first * and sequential devices must be freed in order. 
*/ if ((sp->sw_flags & (SW_SEQUENTIAL|SW_FREED)) == SW_SEQUENTIAL) break; #endif } vrele(vp); return (EINVAL); } /* * Swfree(index) frees the index'th portion of the swap map. * Each of the nswdev devices provides 1/nswdev'th of the swap * space, which is laid out with blocks of dmmax pages circularly * among the devices. */ int swfree(p, index) struct proc *p; int index; { register struct swdevt *sp; register swblk_t vsbase; register long blk; struct vnode *vp; register swblk_t dvbase; register int nblks; int error; sp = &swdevt[index]; vp = sp->sw_vp; if (error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)) return (error); sp->sw_flags |= SW_FREED; nblks = sp->sw_nblks; /* * Some devices may not exist til after boot time. * If so, their nblk count will be 0. */ if (nblks <= 0) { int perdev; dev_t dev = sp->sw_dev; if (bdevsw[major(dev)].d_psize == 0 || (nblks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) { (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); sp->sw_flags &= ~SW_FREED; return (ENXIO); } #ifdef SEQSWAP if (index < niswdev) { perdev = niswap / niswdev; if (nblks > perdev) nblks = perdev; } else { if (nblks % dmmax) nblks -= (nblks % dmmax); nswap += nblks; } #else perdev = nswap / nswdev; if (nblks > perdev) nblks = perdev; #endif sp->sw_nblks = nblks; } if (nblks == 0) { (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); sp->sw_flags &= ~SW_FREED; return (0); /* XXX error? 
*/ } #ifdef SEQSWAP if (sp->sw_flags & SW_SEQUENTIAL) { register struct swdevt *swp; blk = niswap; for (swp = &swdevt[niswdev]; swp != sp; swp++) blk += swp->sw_nblks; #if 0 rmfree(swapmap, nblks, blk); return (0); #endif rlist_free(&swapmap, blk, blk + nblks - 1); vm_swap_size += nblks; return (0); } #endif for (dvbase = 0; dvbase < nblks; dvbase += dmmax) { blk = nblks - dvbase; #ifdef SEQSWAP if ((vsbase = index*dmmax + dvbase*niswdev) >= niswap) panic("swfree"); #else if ((vsbase = index*dmmax + dvbase*nswdev) >= nswap) panic("swfree"); #endif if (blk > dmmax) blk = dmmax; #if 0 if (vsbase == 0) { /* * First of all chunks... initialize the swapmap. * Don't use the first cluster of the device * in case it starts with a label or boot block. */ rminit(swapmap, blk - ctod(CLSIZE), vsbase + ctod(CLSIZE), "swap", nswapmap); } else if (dvbase == 0) { /* * Don't use the first cluster of the device * in case it starts with a label or boot block. */ rmfree(swapmap, blk - ctod(CLSIZE), vsbase + ctod(CLSIZE)); } else rmfree(swapmap, blk, vsbase); #endif /* XXX -- we need to exclude the first cluster as above */ /* but for now, this will work fine... */ rlist_free(&swapmap, vsbase, vsbase + blk - 1); vm_swap_size += blk; } return (0); } Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 1886) +++ head/sys/vm/vnode_pager.c (revision 1887) @@ -1,1438 +1,1474 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993,1994 John S. Dyson * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.2 1994/05/25 09:21:11 rgrimes Exp $ + * $Id: vnode_pager.c,v 1.3 1994/08/04 03:06:48 davidg Exp $ */ /* * Page to/from files (vnodes). * * TODO: * pageouts * fix credential use (uses current process credentials now) */ /* * MODIFICATIONS: * John S. 
Dyson 08 Dec 93 * * This file in conjunction with some vm_fault mods, eliminate the performance * advantage for using the buffer cache and minimize memory copies. * * 1) Supports multiple - block reads * 2) Bypasses buffer cache for reads * * TODO: * * 1) Totally bypass buffer cache for reads * (Currently will still sometimes use buffer cache for reads) * 2) Bypass buffer cache for writes * (Code does not support it, but mods are simple) */ #include #include #include #include #include #include #include #include #include #include #include #include int vnode_pager_putmulti(); void vnode_pager_init(); vm_pager_t vnode_pager_alloc(caddr_t, vm_offset_t, vm_prot_t, vm_offset_t); void vnode_pager_dealloc(); int vnode_pager_getpage(); int vnode_pager_getmulti(); int vnode_pager_putpage(); boolean_t vnode_pager_haspage(); struct pagerops vnodepagerops = { vnode_pager_init, vnode_pager_alloc, vnode_pager_dealloc, vnode_pager_getpage, vnode_pager_getmulti, vnode_pager_putpage, vnode_pager_putmulti, vnode_pager_haspage }; + + static int vnode_pager_input(vn_pager_t vnp, vm_page_t * m, int count, int reqpage); static int vnode_pager_output(vn_pager_t vnp, vm_page_t * m, int count, int *rtvals); struct buf * getpbuf(); void relpbuf(struct buf * bp); extern vm_map_t pager_map; struct pagerlst vnode_pager_list; /* list of managed vnodes */ #define MAXBP (PAGE_SIZE/DEV_BSIZE); void vnode_pager_init() { TAILQ_INIT(&vnode_pager_list); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_pager_t vnode_pager_alloc(handle, size, prot, offset) caddr_t handle; vm_size_t size; vm_prot_t prot; vm_offset_t offset; { register vm_pager_t pager; register vn_pager_t vnp; vm_object_t object; struct vattr vattr; struct vnode *vp; struct proc *p = curproc; /* XXX */ /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); /* * Vnodes keep a pointer to any associated pager so no need to lookup * with vm_pager_lookup. 
*/ vp = (struct vnode *) handle; pager = (vm_pager_t) vp->v_vmdata; if (pager == NULL) { /* * Allocate pager structures */ pager = (vm_pager_t) malloc(sizeof *pager, M_VMPAGER, M_WAITOK); if (pager == NULL) return (NULL); vnp = (vn_pager_t) malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK); if (vnp == NULL) { free((caddr_t) pager, M_VMPAGER); return (NULL); } /* * And an object of the appropriate size */ if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) { object = vm_object_allocate(round_page(vattr.va_size)); vm_object_enter(object, pager); vm_object_setpager(object, pager, 0, TRUE); } else { free((caddr_t) vnp, M_VMPGDATA); free((caddr_t) pager, M_VMPAGER); return (NULL); } /* * Hold a reference to the vnode and initialize pager data. */ VREF(vp); vnp->vnp_flags = 0; vnp->vnp_vp = vp; vnp->vnp_size = vattr.va_size; TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list); pager->pg_handle = handle; pager->pg_type = PG_VNODE; pager->pg_ops = &vnodepagerops; pager->pg_data = (caddr_t) vnp; vp->v_vmdata = (caddr_t) pager; } else { /* * vm_object_lookup() will remove the object from the cache if * found and also gain a reference to the object. 
*/ object = vm_object_lookup(pager); } return (pager); } void vnode_pager_dealloc(pager) vm_pager_t pager; { register vn_pager_t vnp = (vn_pager_t) pager->pg_data; register struct vnode *vp; struct proc *p = curproc; /* XXX */ if (vp = vnp->vnp_vp) { vp->v_vmdata = NULL; vp->v_flag &= ~VTEXT; #if 0 /* can hang if done at reboot on NFS FS */ (void) VOP_FSYNC(vp, p->p_ucred, p); #endif vrele(vp); } TAILQ_REMOVE(&vnode_pager_list, pager, pg_list); free((caddr_t) vnp, M_VMPGDATA); free((caddr_t) pager, M_VMPAGER); } int vnode_pager_getmulti(pager, m, count, reqpage, sync) vm_pager_t pager; vm_page_t *m; int count; int reqpage; boolean_t sync; { return vnode_pager_input((vn_pager_t) pager->pg_data, m, count, reqpage); } int vnode_pager_getpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { int err; vm_page_t marray[1]; if (pager == NULL) return FALSE; marray[0] = m; return vnode_pager_input((vn_pager_t) pager->pg_data, marray, 1, 0); } boolean_t vnode_pager_putpage(pager, m, sync) vm_pager_t pager; vm_page_t m; boolean_t sync; { int err; vm_page_t marray[1]; int rtvals[1]; if (pager == NULL) return FALSE; marray[0] = m; vnode_pager_output((vn_pager_t) pager->pg_data, marray, 1, rtvals); return rtvals[0]; } int vnode_pager_putmulti(pager, m, c, sync, rtvals) vm_pager_t pager; vm_page_t *m; int c; boolean_t sync; int *rtvals; { return vnode_pager_output((vn_pager_t) pager->pg_data, m, c, rtvals); } boolean_t vnode_pager_haspage(pager, offset) vm_pager_t pager; vm_offset_t offset; { register vn_pager_t vnp = (vn_pager_t) pager->pg_data; daddr_t bn; int err; /* * Offset beyond end of file, do not have the page */ if (offset >= vnp->vnp_size) { return (FALSE); } /* * Read the index to find the disk block to read from. If there is no * block, report that we don't have this data. * * Assumes that the vnode has whole page or nothing. 
*/ err = VOP_BMAP(vnp->vnp_vp, offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize, (struct vnode **) 0, &bn, 0); if (err) { return (TRUE); } return ((long) bn < 0 ? FALSE : TRUE); } /* * Lets the VM system know about a change in size for a file. * If this vnode is mapped into some address space (i.e. we have a pager * for it) we adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(vp, nsize) struct vnode *vp; u_long nsize; { register vn_pager_t vnp; register vm_object_t object; vm_pager_t pager; /* * Not a mapped vnode */ if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL) return; /* * Hasn't changed size */ pager = (vm_pager_t) vp->v_vmdata; vnp = (vn_pager_t) pager->pg_data; if (nsize == vnp->vnp_size) return; /* * No object. This can happen during object termination since * vm_object_page_clean is called after the object has been removed * from the hash table, and clean may cause vnode write operations * which can wind up back here. */ object = vm_object_lookup(pager); if (object == NULL) return; /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nsize < vnp->vnp_size) { vm_object_lock(object); vm_object_page_remove(object, round_page((vm_offset_t) nsize), vnp->vnp_size); vm_object_unlock(object); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode... */ if (nsize & PAGE_MASK) { vm_offset_t kva; vm_page_t m; m = vm_page_lookup(object, trunc_page((vm_offset_t) nsize)); if (m) { kva = vm_pager_map_page(m); bzero((caddr_t) kva + (nsize & PAGE_MASK), round_page(nsize) - nsize); vm_pager_unmap_page(kva); } } } else { /* * this allows the filesystem and VM cache to stay in sync if * the VM page hasn't been modified... 
After the page is * removed -- it will be faulted back in from the filesystem * cache. */ if (vnp->vnp_size & PAGE_MASK) { vm_page_t m; m = vm_page_lookup(object, trunc_page(vnp->vnp_size)); if (m && (m->flags & PG_CLEAN)) { vm_object_lock(object); vm_object_page_remove(object, vnp->vnp_size, vnp->vnp_size); vm_object_unlock(object); } } } vnp->vnp_size = (vm_offset_t) nsize; object->size = round_page(nsize); vm_object_deallocate(object); } void vnode_pager_umount(mp) register struct mount *mp; { register vm_pager_t pager, npager; struct vnode *vp; pager = vnode_pager_list.tqh_first; while (pager) { /* * Save the next pointer now since uncaching may terminate the * object and render pager invalid */ vp = ((vn_pager_t) pager->pg_data)->vnp_vp; npager = pager->pg_list.tqe_next; if (mp == (struct mount *) 0 || vp->v_mount == mp) (void) vnode_pager_uncache(vp); pager = npager; } } /* * Remove vnode associated object from the object cache. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ boolean_t vnode_pager_uncache(vp) register struct vnode *vp; { register vm_object_t object; boolean_t uncached, locked; vm_pager_t pager; /* * Not a mapped vnode */ pager = (vm_pager_t) vp->v_vmdata; if (pager == NULL) return (TRUE); /* * Unlock the vnode if it is currently locked. We do this since * uncaching the object may result in its destruction which may * initiate paging activity which may necessitate locking the vnode. */ locked = VOP_ISLOCKED(vp); if (locked) VOP_UNLOCK(vp); /* * Must use vm_object_lookup() as it actually removes the object from * the cache list. 
*/ object = vm_object_lookup(pager); if (object) { uncached = (object->ref_count <= 1); pager_cache(object, FALSE); } else uncached = TRUE; if (locked) VOP_LOCK(vp); return (uncached); } void vnode_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * calculate the linear (byte) disk address of specified virtual * file address */ vm_offset_t vnode_pager_addr(vp, address) struct vnode *vp; vm_offset_t address; { int rtaddress; int bsize; vm_offset_t block; struct vnode *rtvp; int err; int vblock, voffset; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, &rtvp, &block, 0); if (err) rtaddress = -1; else rtaddress = block * DEV_BSIZE + voffset; return rtaddress; } /* * interrupt routine for I/O completion */ void vnode_pager_iodone(bp) struct buf *bp; { + int s = splbio(); bp->b_flags |= B_DONE; wakeup((caddr_t) bp); + if( bp->b_flags & B_ASYNC) { + vm_offset_t paddr; + vm_page_t m; + vm_object_t obj = 0; + int i; + int npages; + + paddr = (vm_offset_t) bp->b_data; + if( bp->b_bufsize != bp->b_bcount) + bzero( bp->b_data + bp->b_bcount, + bp->b_bufsize - bp->b_bcount); + + npages = (bp->b_bufsize + PAGE_SIZE - 1) / PAGE_SIZE; + for( i = 0; i < npages; i++) { + m = PHYS_TO_VM_PAGE(pmap_kextract(paddr + i * PAGE_SIZE)); + obj = m->object; + if( m) { + m->flags |= PG_CLEAN; + m->flags &= ~(PG_LAUNDRY|PG_FAKE); + PAGE_WAKEUP(m); + } else { + panic("vnode_pager_iodone: page is gone!!!"); + } + } + if( obj) { + --obj->paging_in_progress; + if( obj->paging_in_progress == 0) + wakeup((caddr_t) obj); + } else { + panic("vnode_pager_iodone: object is gone???"); + } + HOLDRELE(bp->b_vp); + splx(s); + relpbuf(bp); + return; + } + splx(s); } /* * small block file system vnode pager input */ int vnode_pager_input_smlfs(vnp, m) vn_pager_t vnp; vm_page_t m; { int i; int s; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t foff; vm_offset_t kva; int 
fileaddr; int block; vm_offset_t bsize; int error = 0; paging_offset = m->object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; foff = m->offset + paging_offset; VOP_BMAP(vp, foff, &dp, 0, 0); kva = vm_pager_map_page(m); for (i = 0; i < PAGE_SIZE / bsize; i++) { /* * calculate logical block and offset */ block = foff / bsize + i; s = splbio(); while (bp = incore(vp, block)) { int amount; /* * wait until the buffer is avail or gone */ if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; tsleep((caddr_t) bp, PVM, "vnwblk", 0); continue; } amount = bsize; if ((foff + bsize) > vnp->vnp_size) amount = vnp->vnp_size - foff; /* * make sure that this page is in the buffer */ if ((amount > 0) && amount <= bp->b_bcount) { bp->b_flags |= B_BUSY; splx(s); /* * copy the data from the buffer */ bcopy(bp->b_un.b_addr, (caddr_t) kva + i * bsize, amount); if (amount < bsize) { bzero((caddr_t) kva + amount, bsize - amount); } bp->b_flags &= ~B_BUSY; wakeup((caddr_t) bp); goto nextblock; } break; } splx(s); fileaddr = vnode_pager_addr(vp, foff + i * bsize); if (fileaddr != -1) { bp = getpbuf(); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr / DEV_BSIZE; bgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; /* do the input */ VOP_STRATEGY(bp); /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep((caddr_t) bp, PVM, "vnsrd", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); if (error) break; } else { bzero((caddr_t) kva + i * bsize, bsize); } nextblock: } vm_pager_unmap_page(kva); if (error) { return 
VM_PAGER_FAIL; } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags |= PG_CLEAN; m->flags &= ~PG_LAUNDRY; return VM_PAGER_OK; } /* * old style vnode pager output routine */ int vnode_pager_input_old(vnp, m) vn_pager_t vnp; vm_page_t m; { int i; struct uio auio; struct iovec aiov; int error; int size; vm_offset_t foff; vm_offset_t kva; error = 0; foff = m->offset + m->object->paging_offset; /* * Return failure if beyond current EOF */ if (foff >= vnp->vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (foff + size > vnp->vnp_size) size = vnp->vnp_size - foff; /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ kva = vm_pager_map_page(m); aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = foff; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_procp = (struct proc *) 0; error = VOP_READ(vnp->vnp_vp, &auio, 0, curproc->p_ucred); if (!error) { register int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t) kva + count, PAGE_SIZE - count); } vm_pager_unmap_page(kva); } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags |= PG_CLEAN; m->flags &= ~PG_LAUNDRY; return error ? 
VM_PAGER_FAIL : VM_PAGER_OK; } /* * generic vnode pager input routine */ int vnode_pager_input(vnp, m, count, reqpage) register vn_pager_t vnp; vm_page_t *m; int count, reqpage; { int i, j; vm_offset_t kva, foff; - int size; + int size, sizea; struct proc *p = curproc; /* XXX */ vm_object_t object; vm_offset_t paging_offset; struct vnode *dp, *vp; vm_offset_t mapsize; int bsize; int first, last; int reqaddr, firstaddr; int block, offset; int nbp; - struct buf *bp; + struct buf *bp, *bpa; + int counta; int s; int failflag; int errtype = 0; /* 0 is file type otherwise vm type */ int error = 0; object = m[reqpage]->object; /* all vm_page_t items are in same * object */ paging_offset = object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! */ - kva = 0; mapsize = 0; foff = m[reqpage]->offset + paging_offset; - if (!VOP_BMAP(vp, foff, &dp, 0, 0)) { - /* - * we do not block for a kva, notice we default to a kva - * conservative behavior - */ - kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE)); - if (!kva) { - for (i = 0; i < count; i++) { - if (i != reqpage) { - vnode_pager_freepage(m[i]); - } - } - m[0] = m[reqpage]; - kva = kmem_alloc_wait(pager_map, mapsize = PAGE_SIZE); - reqpage = 0; - count = 1; - } - } - /* - * if we can't get a kva or we can't bmap, use old VOP code + * if we can't bmap, use old VOP code */ - if (!kva) { + if (VOP_BMAP(vp, foff, &dp, 0, 0)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } return vnode_pager_input_old(vnp, m[reqpage]); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself. 
*/ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { - kmem_free_wakeup(pager_map, kva, mapsize); - for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } return vnode_pager_input_smlfs(vnp, m[reqpage]); } /* * here on direct device I/O */ /* * This pathetic hack gets data from the buffer cache, if it's there. * I believe that this is not really necessary, and the ends can be * gotten by defaulting to the normal vfs read behavior, but this * might be more efficient, because the will NOT invoke read-aheads * and one of the purposes of this code is to bypass the buffer cache * and keep from flushing it by reading in a program. */ /* * calculate logical block and offset */ block = foff / bsize; offset = foff % bsize; s = splbio(); /* * if we have a buffer in core, then try to use it */ while (bp = incore(vp, block)) { int amount; /* * wait until the buffer is avail or gone */ if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; tsleep((caddr_t) bp, PVM, "vnwblk", 0); continue; } amount = PAGE_SIZE; if ((foff + amount) > vnp->vnp_size) amount = vnp->vnp_size - foff; /* * make sure that this page is in the buffer */ if ((amount > 0) && (offset + amount) <= bp->b_bcount) { bp->b_flags |= B_BUSY; splx(s); + kva = kmem_alloc_pageable( pager_map, PAGE_SIZE); /* * map the requested page */ - pmap_kenter(kva, VM_PAGE_TO_PHYS(m[reqpage])); - pmap_update(); + pmap_qenter(kva, &m[reqpage], 1); /* * copy the data from the buffer */ bcopy(bp->b_un.b_addr + offset, (caddr_t) kva, amount); if (amount < PAGE_SIZE) { bzero((caddr_t) kva + amount, PAGE_SIZE - amount); } /* * unmap the page and free the kva */ - pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE); + pmap_qremove( kva, 1); kmem_free_wakeup(pager_map, kva, mapsize); /* * release the buffer back to the block subsystem */ bp->b_flags &= ~B_BUSY; wakeup((caddr_t) bp); /* * we did not have to do any work to get the requested * page, the read behind/ahead does 
not justify a read */ for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } count = 1; reqpage = 0; m[0] = m[reqpage]; /* * sorry for the goto */ goto finishup; } /* * buffer is nowhere to be found, read from the disk */ break; } splx(s); reqaddr = vnode_pager_addr(vp, foff); s = splbio(); /* * Make sure that our I/O request is contiguous. Scan backward and * stop for the first discontiguous entry or stop for a page being in * buffer cache. */ failflag = 0; first = reqpage; for (i = reqpage - 1; i >= 0; --i) { if (failflag || incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || (vnode_pager_addr(vp, m[i]->offset + paging_offset)) != reqaddr + (i - reqpage) * PAGE_SIZE) { vnode_pager_freepage(m[i]); failflag = 1; } else { first = i; } } /* * Scan forward and stop for the first non-contiguous entry or stop * for a page being in buffer cache. */ failflag = 0; last = reqpage + 1; for (i = reqpage + 1; i < count; i++) { if (failflag || incore(vp, (foff + (i - reqpage) * PAGE_SIZE) / bsize) || (vnode_pager_addr(vp, m[i]->offset + paging_offset)) != reqaddr + (i - reqpage) * PAGE_SIZE) { vnode_pager_freepage(m[i]); failflag = 1; } else { last = i + 1; } } splx(s); /* * the first and last page have been calculated now, move input pages * to be zero based... 
*/ count = last; if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; } count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = m[0]->offset + paging_offset; /* * and get the disk physical address (in bytes) */ firstaddr = vnode_pager_addr(vp, foff); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > vnp->vnp_size) size = vnp->vnp_size - foff; /* * round up physical size for real devices */ if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + counta = 0; + if( count*PAGE_SIZE > bsize) + counta = (count - reqpage) - 1; + bpa = 0; + sizea = 0; + if( counta) { + bpa = getpbuf(); + count -= counta; + sizea = size - count*PAGE_SIZE; + size = count * PAGE_SIZE; + } + + bp = getpbuf(); + kva = (vm_offset_t)bp->b_data; + /* * and map the pages to be read into the kva */ - for (i = 0; i < count; i++) - pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); - - pmap_update(); - bp = getpbuf(); + pmap_qenter(kva, m, count); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); - bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = firstaddr / DEV_BSIZE; bgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; /* do the input */ VOP_STRATEGY(bp); + if( counta) { + for(i=0;ib_data, &m[count], counta); + ++m[count]->object->paging_in_progress; + VHOLD(vp); + bpa->b_flags = B_BUSY | B_READ | B_CALL | B_ASYNC; + bpa->b_iodone = vnode_pager_iodone; + /* B_PHYS is not set, but it is nice to fill this in */ + bpa->b_proc = curproc; + bpa->b_rcred = bpa->b_wcred = bpa->b_proc->p_ucred; + if (bpa->b_rcred != NOCRED) + crhold(bpa->b_rcred); + if 
(bpa->b_wcred != NOCRED) + crhold(bpa->b_wcred); + bpa->b_blkno = (firstaddr + count * PAGE_SIZE) / DEV_BSIZE; + bgetvp(dp, bpa); + bpa->b_bcount = sizea; + bpa->b_bufsize = counta*PAGE_SIZE; + VOP_STRATEGY(bpa); + } + s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { tsleep((caddr_t) bp, PVM, "vnread", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } - pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); - kmem_free_wakeup(pager_map, kva, mapsize); + pmap_qremove( kva, count); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); finishup: for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->flags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that emperical results show that * deactivating pages is best. */ /* * just in case someone was asking for this page we * now tell them that it is ok to use */ if (!error) { vm_page_deactivate(m[i]); PAGE_WAKEUP(m[i]); m[i]->flags &= ~PG_FAKE; } else { vnode_pager_freepage(m[i]); } } } if (error) { printf("vnode pager read error: %d\n", error); } if (errtype) return error; return (error ? 
VM_PAGER_FAIL : VM_PAGER_OK); } /* * old-style vnode pager output routine */ int vnode_pager_output_old(vnp, m) register vn_pager_t vnp; vm_page_t m; { vm_offset_t foff; vm_offset_t kva; vm_offset_t size; struct iovec aiov; struct uio auio; struct vnode *vp; int error; vp = vnp->vnp_vp; foff = m->offset + m->object->paging_offset; /* * Return failure if beyond current EOF */ if (foff >= vnp->vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (foff + size > vnp->vnp_size) size = vnp->vnp_size - foff; /* * Allocate a kernel virtual address and initialize so that * we can use VOP_WRITE routines. */ kva = vm_pager_map_page(m); aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = foff; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_resid = size; auio.uio_procp = (struct proc *) 0; error = VOP_WRITE(vp, &auio, 0, curproc->p_ucred); if (!error) { if ((size - auio.uio_resid) == 0) { error = EINVAL; } } vm_pager_unmap_page(kva); return error ? 
VM_PAGER_FAIL : VM_PAGER_OK; } } /* * vnode pager output on a small-block file system */ int vnode_pager_output_smlfs(vnp, m) vn_pager_t vnp; vm_page_t m; { int i; int s; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t foff; vm_offset_t kva; int fileaddr; int block; vm_offset_t bsize; int error = 0; paging_offset = m->object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; foff = m->offset + paging_offset; VOP_BMAP(vp, foff, &dp, 0, 0); kva = vm_pager_map_page(m); for (i = 0; !error && i < (PAGE_SIZE / bsize); i++) { /* * calculate logical block and offset */ fileaddr = vnode_pager_addr(vp, foff + i * bsize); if (fileaddr != -1) { s = splbio(); if (bp = incore(vp, (foff / bsize) + i)) { bp = getblk(vp, (foff / bsize) + i, bp->b_bufsize, 0, 0); bp->b_flags |= B_INVAL; brelse(bp); } splx(s); bp = getpbuf(); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_CALL | B_WRITE; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr / DEV_BSIZE; bgetvp(dp, bp); ++dp->v_numoutput; /* for NFS */ bp->b_dirtyoff = 0; bp->b_dirtyend = bsize; bp->b_bcount = bsize; bp->b_bufsize = bsize; /* do the input */ VOP_STRATEGY(bp); /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep((caddr_t) bp, PVM, "vnswrt", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); } } vm_pager_unmap_page(kva); if (error) return VM_PAGER_FAIL; else return VM_PAGER_OK; } /* * generic vnode pager output routine */ int vnode_pager_output(vnp, m, count, rtvals) vn_pager_t vnp; vm_page_t *m; int count; int *rtvals; { int i, j; vm_offset_t kva, foff; int size; 
struct proc *p = curproc; /* XXX */ vm_object_t object; vm_offset_t paging_offset; struct vnode *dp, *vp; struct buf *bp; vm_offset_t mapsize; vm_offset_t reqaddr; int bsize; int s; int error = 0; retryoutput: object = m[0]->object; /* all vm_page_t items are in same object */ paging_offset = object->paging_offset; vp = vnp->vnp_vp; bsize = vp->v_mount->mnt_stat.f_iosize; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; /* * if the filesystem does not have a bmap, then use the old code */ if (VOP_BMAP(vp, m[0]->offset + paging_offset, &dp, 0, 0)) { rtvals[0] = vnode_pager_output_old(vnp, m[0]); pmap_clear_modify(VM_PAGE_TO_PHYS(m[0])); m[0]->flags |= PG_CLEAN; m[0]->flags &= ~PG_LAUNDRY; return rtvals[0]; } /* * if the filesystem has a small blocksize, then use the small block * filesystem output code */ if ((bsize < PAGE_SIZE) && (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { for (i = 0; i < count; i++) { rtvals[i] = vnode_pager_output_smlfs(vnp, m[i]); if (rtvals[i] == VM_PAGER_OK) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->flags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; } } return rtvals[0]; } - /* - * get some kva for the output - */ - kva = kmem_alloc_pageable(pager_map, (mapsize = count * PAGE_SIZE)); - if (!kva) { - kva = kmem_alloc_pageable(pager_map, (mapsize = PAGE_SIZE)); - count = 1; - if (!kva) - return rtvals[0]; - } for (i = 0; i < count; i++) { foff = m[i]->offset + paging_offset; if (foff >= vnp->vnp_size) { for (j = i; j < count; j++) rtvals[j] = VM_PAGER_BAD; count = i; break; } } if (count == 0) { return rtvals[0]; } foff = m[0]->offset + paging_offset; reqaddr = vnode_pager_addr(vp, foff); /* * Scan forward and stop for the first non-contiguous entry or stop * for a page being in buffer cache. 
*/ for (i = 1; i < count; i++) { if (vnode_pager_addr(vp, m[i]->offset + paging_offset) != reqaddr + i * PAGE_SIZE) { count = i; break; } } /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > vnp->vnp_size) size = vnp->vnp_size - foff; /* * round up physical size for real devices */ if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + bp = getpbuf(); + kva = (vm_offset_t)bp->b_data; /* * and map the pages to be read into the kva */ - for (i = 0; i < count; i++) - pmap_kenter(kva + PAGE_SIZE * i, VM_PAGE_TO_PHYS(m[i])); - pmap_update(); -/* + pmap_qenter(kva, m, count); printf("vnode: writing foff: %d, devoff: %d, size: %d\n", foff, reqaddr, size); -*/ /* * next invalidate the incore vfs_bio data */ for (i = 0; i < count; i++) { int filblock = (foff + i * PAGE_SIZE) / bsize; struct buf *fbp; s = splbio(); if (fbp = incore(vp, filblock)) { fbp = getblk(vp, filblock, fbp->b_bufsize, 0, 0); if (fbp->b_flags & B_DELWRI) { if (fbp->b_bufsize <= PAGE_SIZE) fbp->b_flags &= ~B_DELWRI; else { bwrite(fbp); fbp = getblk(vp, filblock, fbp->b_bufsize, 0, 0); } } fbp->b_flags |= B_INVAL; brelse(fbp); } splx(s); } - bp = getpbuf(); VHOLD(vp); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_WRITE | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); - bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr / DEV_BSIZE; bgetvp(dp, bp); ++dp->v_numoutput; /* for NFS */ bp->b_dirtyoff = 0; bp->b_dirtyend = size; bp->b_bcount = size; bp->b_bufsize = size; /* do the output */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { tsleep((caddr_t) bp, PVM, "vnwrite", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = 
EIO; - pmap_remove(vm_map_pmap(pager_map), kva, kva + PAGE_SIZE * count); - kmem_free_wakeup(pager_map, kva, mapsize); + pmap_qremove( kva, count); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); HOLDRELE(vp); if (!error) { for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->flags |= PG_CLEAN; m[i]->flags &= ~PG_LAUNDRY; rtvals[i] = VM_PAGER_OK; } } else if (count != 1) { error = 0; count = 1; goto retryoutput; } if (error) { printf("vnode pager write error: %d\n", error); } return (error ? VM_PAGER_FAIL : VM_PAGER_OK); }