Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 13489) +++ head/sys/amd64/amd64/machdep.c (revision 13490) @@ -1,1820 +1,1820 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.168 1996/01/04 21:10:53 wollman Exp $ + * $Id: machdep.c,v 1.169 1996/01/05 20:12:19 wollman Exp $ */ #include "npx.h" #include "isa.h" #include "opt_sysvipc.h" #include "opt_ddb.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void i486_bzero __P((void *, size_t)); extern void i586_bzero __P((void *, size_t)); extern void i686_bzero __P((void *, size_t)); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static void identifycpu(void); char machine[] = "i386"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, ""); struct kern_devconf kdc_cpu0 = { 0, 0, 0, /* filled in by dev_attach */ "cpu", 0, { MDDT_CPU }, 0, 0, 0, CPU_EXTERNALLEN, 0, /* CPU has no parent */ 0, /* no parentdata */ DC_BUSY, /* the CPU is always busy */ cpu_model, /* no sense in duplication */ DC_CLS_CPU /* class */ }; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif #ifdef BOUNCE_BUFFERS extern char *bouncememory; extern int maxbkva; #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ extern int freebufspace; int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; int physmem = 0; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int boothowto = 0, bootverbose = 0, Maxmem = 0; static int badpages = 0; long dumplo; extern int bootdev; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) int cpu_class; static void dumpsys __P((void)); static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Initialize 
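A note for readers unfamiliar with the __P(()) wrapper on the prototypes above: it is the classic BSD <sys/cdefs.h> compatibility macro that lets one declaration serve both ANSI and K&R compilers. A minimal sketch of its definition (illustrative, not copied from this tree):

#if defined(__STDC__)
#define	__P(protos)	protos		/* ANSI C: keep the parameter list */
#else
#define	__P(protos)	()		/* K&R C: drop it */
#endif

/* Under an ANSI compiler the first extern above thus expands to: */
extern void init386(int first);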
error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); startrtclock(); identifycpu(); printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (badpages != 0) { int indx = 1; /* * XXX skip reporting ISA hole & unmanaged kernel memory */ if (phys_avail[0] == PAGE_SIZE) indx += 2; printf("Physical memory hole(s):\n"); for (; phys_avail[indx + 1] != 0; indx += 2) { int size = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size, size / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* #ifdef ISDN DONET(isdnintr, NETISR_ISDN); #endif */ /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 12, 1024); } nswbuf = min(nbuf, 128); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; } v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(nmbclusters+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, nmbclusters+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, nmbclusters * MCLBYTES, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; if (boothowto & RB_CONFIG) { userconfig(); cninit(); /* the preferred console may have changed */ } #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif /* * XXX allocate a contiguous area for ISA (non busmaster) DMA * operations. This _should_ only be done if the DMA channels * will actually be used, but for now we do it always. */ #define DMAPAGES 8 isaphysmem = vm_page_alloc_contig(DMAPAGES * PAGE_SIZE, 0, 0xfffffful, 64*1024); printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * In verbose mode, print out the BIOS's idea of the disk geometries. */ if (bootverbose) { printf("BIOS Geometries:\n"); for (i = 0; i < N_BIOS_GEOM; i++) { unsigned long bios_geom; int max_cylinder, max_head, max_sector; bios_geom = bootinfo.bi_bios_geom[i]; /* * XXX the bootstrap punts a 1200K floppy geometry * when the get-disk-geometry interrupt fails. Skip * drives that have this geometry. 
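The valloc()/valloclim() two-pass sizing above is easier to see in isolation. A standalone userland sketch of the same idiom, with a hypothetical table and malloc() standing in for kmem_alloc(); note it leans on the same advance-a-pointer-from-a-NULL-base arithmetic that the kernel uses:

#include <stdlib.h>

#define VALLOC(name, type, num) \
	((name) = (type *)v, v = (char *)((name) + (num)))

int
main(void)
{
	char *v, *firstaddr = NULL;
	size_t size = 0;
	int *table, nentries = 128;		/* hypothetical structure */

again:
	v = firstaddr;
	VALLOC(table, int, nentries);
	if (firstaddr == NULL) {
		/* pass 1: v advanced from NULL, so its offset is the size */
		size = (size_t)(v - firstaddr);
		firstaddr = malloc(size);
		goto again;			/* pass 2: real addresses */
	}
	free(firstaddr);
	return (0);
}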
*/ if (bios_geom == 0x4f010f) continue; printf(" %x:%08lx ", i, bios_geom); max_cylinder = bios_geom >> 16; max_head = (bios_geom >> 8) & 0xff; max_sector = bios_geom & 0xff; printf( "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n", max_cylinder, max_cylinder + 1, max_head, max_head + 1, max_sector, max_sector); } printf(" %d accounted for\n", bootinfo.bi_n_bios_used); } } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } static struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "Pentium", CPUCLASS_586 }, /* CPU_586 */ { "Cy486DLC", CPUCLASS_486 }, /* CPU_486DLC */ { "Pentium Pro", CPUCLASS_686 }, /* CPU_686 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { cpu_class = i386_cpus[cpu].cpu_class; strncpy(cpu_model, i386_cpus[cpu].cpu_name, sizeof cpu_model); } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } #if defined(I586_CPU) || defined(I686_CPU) if (cpu_class == CPUCLASS_586 || cpu_class == CPUCLASS_686) { calibrate_cyclecounter(); } #endif #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if (!strcmp(cpu_vendor,"GenuineIntel")) { if ((cpu_id & 0xf00) > 3) { cpu_model[0] = '\0'; switch (cpu_id & 0x3000) { case 0x1000: strcpy(cpu_model, "Overdrive "); break; case 0x2000: strcpy(cpu_model, "Dual "); break; } switch (cpu_id & 0xf00) { case 0x400: strcat(cpu_model, "i486 "); break; case 0x500: strcat(cpu_model, "Pentium"); /* nb no space */ break; case 0x600: strcat(cpu_model, "Pentium Pro"); break; default: strcat(cpu_model, "unknown"); break; } switch (cpu_id & 0xff0) { case 0x400: strcat(cpu_model, "DX"); break; case 0x410: strcat(cpu_model, "DX"); break; case 0x420: strcat(cpu_model, "SX"); break; case 0x430: strcat(cpu_model, "DX2"); break; case 0x440: strcat(cpu_model, "SL"); break; case 0x450: strcat(cpu_model, "SX2"); break; case 0x470: strcat(cpu_model, "DX2 Write-Back Enhanced"); break; case 0x480: strcat(cpu_model, "DX4"); break; break; } } } #endif printf("%s (", cpu_model); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; #if defined(I386_CPU) case CPUCLASS_386: printf("386"); break; #endif #if defined(I486_CPU) case CPUCLASS_486: printf("486"); bzero = i486_bzero; break; #endif #if defined(I586_CPU) case CPUCLASS_586: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); bzero = i586_bzero; break; #endif #if defined(I686_CPU) case CPUCLASS_686: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); bzero = i686_bzero; break; #endif default: printf("unknown"); /* will panic below... 
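For reference, the cpu_id masks used by identifycpu() above slice the i386 ID word into stepping (bits 0-3), model (bits 4-7) and family (bits 8-11). A tiny standalone decoder with an example value, not kernel code:

#include <stdio.h>

int
main(void)
{
	unsigned cpu_id = 0x480;	/* example: family 4, model 8 (DX4) */

	printf("family %u, model %u, stepping %u\n",
	    (cpu_id >> 8) & 0xf, (cpu_id >> 4) & 0xf, cpu_id & 0xf);
	return (0);
}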
*/ } printf("-class CPU)\n"); #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if(*cpu_vendor) printf(" Origin = \"%s\"",cpu_vendor); if(cpu_id) printf(" Id = 0x%lx",cpu_id); if (!strcmp(cpu_vendor, "GenuineIntel")) { printf(" Stepping=%ld", cpu_id & 0xf); if (cpu_high > 0) { #define FEATUREFMT "\020\001FPU\002VME\003PSE\004MCE\005CX8\006APIC" printf("\n Features=0x%b", cpu_feature, FEATUREFMT); } } /* Avoid ugly blank lines: only print newline when we have to. */ if (*cpu_vendor || cpu_id) printf("\n"); #endif /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif #if !defined(I686_CPU) case CPUCLASS_686: #endif panic("CPU class not configured"); default: break; } dev_attach(&kdc_cpu0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. 
*/ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs[tERR]; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs[tEAX]; sf.sf_sc.sc_ebx = regs[tEBX]; sf.sf_sc.sc_ecx = regs[tECX]; sf.sf_sc.sc_edx = regs[tEDX]; sf.sf_sc.sc_esi = regs[tESI]; sf.sf_sc.sc_edi = regs[tEDI]; sf.sf_sc.sc_cs = regs[tCS]; sf.sf_sc.sc_ds = regs[tDS]; sf.sf_sc.sc_ss = regs[tSS]; sf.sf_sc.sc_es = regs[tES]; sf.sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs[tESP]; sf.sf_sc.sc_fp = regs[tEBP]; sf.sf_sc.sc_pc = regs[tEIP]; sf.sf_sc.sc_ps = regs[tEFLAGS]; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); }; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = scp->sc_ps; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs[tEFLAGS] & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
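The EFLAGS_SECURE() test defined above is an XOR-and-mask check: the new value is accepted only if every bit that changed lies inside the user-changeable set. A self-contained demonstration; the mask below is an illustrative stand-in for PSL_USERCHANGE, not its real value:

#include <stdio.h>

#define USERCHANGE	0x00000cd5u	/* example: CF/PF/AF/ZF/SF/DF/OF */
#define SECURE(ef, oef)	((((ef) ^ (oef)) & ~USERCHANGE) == 0)

int
main(void)
{
	unsigned old = 0x00000202;	/* IF set, reserved bit 1 set */

	printf("%d\n", SECURE(old | 0x0001, old));	/* 1: CF may change */
	printf("%d\n", SECURE(old | 0x3000, old));	/* 0: IOPL may not */
	return (0);
}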
*/ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } static int waittime = -1; static struct pcb dumppcb; __dead void boot(howto) int howto; { if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { register struct buf *bp; int iter, nbusy; waittime = 0; printf("\nsyncing disks... "); sync(&proc0, NULL, NULL); for (iter = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; } } if (nbusy == 0) break; printf("%d ", nbusy); DELAY(40000 * iter); } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("giving up\n"); #ifdef SHOW_BUSYBUFS nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); } } DELAY(5000000); /* 5 seconds */ #endif } else { printf("done\n"); /* * Unmount filesystems */ if (panicstr == 0) vfs_unmountall(); } DELAY(100000); /* wait for console output to finish */ dev_shutdownall(FALSE); } splhigh(); if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cngetc(); } else { if (howto & RB_DUMP) { if (!cold) { savectx(&dumppcb, 0); dumppcb.pcb_ptd = rcr3(); dumpsys(); } if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { int loop; printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ if (cncheckc()) /* Did user type a key? */ break; } if (!loop) goto die; } } else { /* zero time specified - reboot NOW */ goto die; } printf("--> Press a key on the console to reboot <--\n"); cngetc(); } } die: printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ cpu_reset(); for(;;) ; /* NOTREACHED */ } /* * Magic number for savecore * * exported (symorder) and used at least by savecore(8) * */ u_long dumpmag = 0x8fca0101UL; static int dumpsize = 0; /* also for savecore */ static int dodump = 1; SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. 
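The disk-sync loop in boot() above polls a busy-buffer count up to 20 times, backing off a little longer on each pass before giving up. The same shape in a runnable sketch, where count_busy() and delay_us() are stand-ins for the buffer scan and DELAY():

#include <stdio.h>

static int busy = 5;				/* pretend 5 buffers start busy */
static int count_busy(void) { return (busy > 0 ? busy-- : 0); }
static void delay_us(long us) { (void)us; }	/* stand-in for DELAY() */

int
main(void)
{
	int iter, nbusy = 0;

	for (iter = 0; iter < 20; iter++) {
		if ((nbusy = count_busy()) == 0)
			break;
		printf("%d ", nbusy);
		delay_us(40000L * iter);	/* grow the delay each pass */
	}
	printf(nbusy ? "giving up\n" : "done\n");
	return (0);
}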
*/ static void dumpsys() { if (!dodump) return; if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; if (!(bdevsw[major(dumpdev)])) return; if (!(bdevsw[major(dumpdev)]->d_dump)) return; dumpsize = Maxmem; printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { int *regs = p->p_md.md_regs; bzero(regs, sizeof(struct trapframe)); regs[tEIP] = entry; regs[tESP] = stack; regs[tEFLAGS] = PSL_USER | (regs[tEFLAGS] & PSL_T); regs[tSS] = _udatasel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; union descriptor gdt[NGDT]; /* global descriptor table */ struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, 
/* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) kstack, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* 
segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall); #if defined(COMPAT_LINUX) || defined(LINUX) extern inthand_t IDTVEC(linux_syscall); #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; int pagesinbase, pagesinext; int target_page, pa_indx; proc0.p_addr = proc0paddr; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was - * i386_btop(i386_round_page(etext)) - 1. + * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. */ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * NBPG) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; /* Note. 
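sdtossd() above reassembles fields that the 386 descriptor layout splits apart: the 32-bit base into 24+8 bits and the 20-bit limit into 16+4. The same arithmetic in isolation, with made-up field values:

#include <stdio.h>

int
main(void)
{
	unsigned lobase = 0x345678, hibase = 0x12;	/* 24 + 8 bits */
	unsigned lolimit = 0xffff, hilimit = 0xf;	/* 16 + 4 bits */

	printf("base  0x%08x\n", (hibase << 24) | lobase);	/* 0x12345678 */
	printf("limit 0x%05x\n", (hilimit << 16) | lolimit);	/* 0xfffff */
	return (0);
}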
eventually want private ldts per process */ for (x = 0; x < NLDT; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #if defined(COMPAT_LINUX) || defined(LINUX) setidt(0x80, &IDTVEC(linux_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * Print a warning if the official BIOS interface disagrees * with the hackish interface used above. Eventually only * the official interface should be used. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) printf("BIOS basemem (%ldK) != RTC basemem (%dK)\n", bootinfo.bi_basemem, biosbasemem); if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) biosbasemem = 640; /* * Some 386 machines might give us a bogus number for extended * mem. If this happens, stop now. */ #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); /* NOTREACHED */ } #endif pagesinbase = biosbasemem * 1024 / NBPG; pagesinext = biosextmem * 1024 / NBPG; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. 
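The rtcin() reads above combine two byte-wide CMOS registers into a 16-bit count of kilobytes. The same combine as a standalone sketch; rtc_read() is a stand-in for rtcin() and the register contents are made up:

#include <stdio.h>

static unsigned rtc_read(int hi) { return (hi ? 0x02 : 0x80); }

int
main(void)
{
	unsigned basemem = rtc_read(0) + (rtc_read(1) << 8);

	printf("basemem = %uK\n", basemem);	/* 0x280 = 640K */
	return (0);
}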
*/ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; #ifdef MAXMEM Maxmem = MAXMEM/4; #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. */ pa_indx = 0; badpages = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase); /* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_KW | PG_N | target_page; pmap_update(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one page past the last * valid page, making the range >= start and < end. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } else { badpages++; page_bad = FALSE; } } *(int *)CMAP1 = 0; pmap_update(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured, and u/iom is accessible */ /* make an initial tss so microp can get interrupt stack on syscall!
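The probe loop above qualifies each page by writing four patterns and reading them back through CADDR1. The core of that test, runnable against an ordinary word rather than a kernel mapping:

#include <stdio.h>

static int
word_ok(volatile unsigned *p)
{
	static const unsigned pat[] =
	    { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000 };
	unsigned saved = *p;
	int i, good = 1;

	for (i = 0; i < 4; i++) {
		*p = pat[i];
		if (*p != pat[i])
			good = 0;
	}
	*p = saved;		/* restore, as the kernel loop does */
	return (good);
}

int
main(void)
{
	volatile unsigned word = 0;

	printf("%s\n", word_ok(&word) ? "good" : "bad");
	return (0);
}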
*/ proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG; proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); ((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt = (sizeof(struct i386tss))<<16; ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode); proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_ptd = IdlePTD; } /* * The registers are in the frame; the frame is in the user area of * the process in question; when the process is active, the registers * are in "the kernel stack"; when it's not, they're still there, but * things get flipped around. So, since p->p_md.md_regs is the whole address * of the register set, take its offset from the kernel stack, and * index into the user block. Don't you just *love* virtual memory? * (I'm starting to think seymour is right...) */ #define TF_REGP(p) ((struct trapframe *) \ ((char *)(p)->p_addr \ + ((char *)(p)->p_md.md_regs - kstack))) int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { TF_REGP(p)->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { TF_REGP(p)->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
*/ min = (char *)p->p_md.md_regs - kstack; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = TF_REGP(p); frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } /* calculate cylinder for disksort to order transfers with */ bp->b_pblkno = bp->b_blkno + p->p_offset; bp->b_cylin = bp->b_pblkno / lp->d_secpercyl; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } int disk_externalize(int drive, struct sysctl_req *req) { return SYSCTL_OUT(req, &drive, sizeof drive); } Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 13489) +++ head/sys/amd64/amd64/pmap.c (revision 13490) @@ -1,1954 +1,2167 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.71 1995/12/17 07:19:15 bde Exp $ + * $Id: pmap.c,v 1.72 1995/12/22 18:21:26 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. 
Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#define PMAP_KEEP_PDIRS + +static void init_pv_entries __P((int)); + /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
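The recursive map described in the header comment above is what makes the pmap_pde()/vtopte() lookups cheap: with the page directory installed as its own entry at slot PTDPTDI, the pte for any va sits at a fixed virtual address. A sketch of that arithmetic; the slot number here is illustrative, not this tree's constant:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PTDPTDI		0x3bfUL			/* illustrative recursive slot */
#define PTMAP		(PTDPTDI << 22)		/* 4MB window of ptes */

int
main(void)
{
	unsigned long va = 0xdeadb000UL;

	printf("pte for 0x%08lx lives at 0x%08lx\n",
	    va, PTMAP + (va >> PAGE_SHIFT) * 4);
	return (0);
}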
*/ static vm_offset_t vm_first_phys; static int nkpt; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; static pv_entry_t pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; static void free_pv_entry __P((pv_entry_t pv)); static pt_entry_t * get_pt_entry __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); -static void init_pv_entries __P((int npg)); static void pmap_alloc_pv_entry __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa)); static int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static void pmap_remove_entry __P((struct pmap *pmap, pv_entry_t pv, vm_offset_t va)); static vm_page_t pmap_pte_vm_page __P((pmap_t pmap, vm_offset_t pt)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); /* + * The below are finer-grained pmap_update routines. These eliminate + * the gratuitous tlb flushes on non-i386 architectures. + */ +static __inline void +pmap_update_1pg( vm_offset_t va) { +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + pmap_update(); + else +#endif + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); +} + +static __inline void +pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) { + pmap_update(); + } else +#endif + { + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); + } +} + +/* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ -inline pt_entry_t * __pure +__inline pt_entry_t * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return ((pt_entry_t *) avtopte(va)); } } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t pa; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } pa = *(int *) avtopte(va); } return ((pa & PG_FRAME) | (va & ~PG_FRAME)); } return 0; } /* * determine if a page is managed (memory vs.
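The .byte 0xf,0x1,0x38 sequence in the new pmap_update_1pg()/pmap_update_2pg() above hand-assembles invlpg [eax] (opcode 0f 01 /7, with modrm 0x38 selecting [eax]) for assemblers that predate the mnemonic. With a newer GNU assembler the same thing could be written directly; a sketch, noting that invlpg is privileged and so only executes in kernel mode:

static __inline void
invlpg_va(unsigned long va)
{
	/* the very instruction the .byte sequence encodes */
	__asm __volatile("invlpg (%0)" : : "r" (va) : "memory");
}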
device) */ -static inline int +static __inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ static __inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { vm_page_t m; - pt = i386_trunc_page(pt); - pt = (pt - UPT_MIN_ADDRESS) / NBPG; + pt = trunc_page(pt); + pt = (pt - UPT_MIN_ADDRESS) / PAGE_SIZE; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; m = PHYS_TO_VM_PAGE(pt); return m; } /* * Wire a page table page */ __inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold(pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ -inline void +__inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; vm_page_t m; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); m = pmap_pte_vm_page(pmap, pt); vm_page_unhold(m); if (pmap != kernel_pmap && (m->hold_count == 0) && (m->wire_count == 0) && (va < KPT_MIN_ADDRESS)) { +/* + * We don't free page-table-pages anymore because it can have a negative + * impact on perf at times. Now we just deactivate, and it'll get cleaned + * up if needed... Also, if the page ends up getting used, it will fault + * back into the process address space and be reactivated. + */ +#ifdef PMAP_FREE_OLD_PTES pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); +#else + m->dirty = 0; + vm_page_deactivate(m); +#endif } } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* - * XXX The calculation of virtual_avail is wrong. It's NKPT*NBPG too + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. 
#define SYSMAP(c, p, v, n)	\
-	v = (c)va; va += ((n)*NBPG); p = pte; pte += (n);
+	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = pmap_pte(kernel_pmap, va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP2, CADDR2, 1)

	/*
	 * ptmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufmap is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1)

	virtual_avail = va;

	*(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0;
	pmap_update();
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 * pmap_init has been enhanced to support discontiguous physical
 * memory in a fairly consistent way.
 */
void
pmap_init(phys_start, phys_end)
	vm_offset_t phys_start, phys_end;
{
	vm_offset_t addr;
	vm_size_t npg, s;
	int i;

	/*
	 * calculate the number of pv_entries needed
	 */
	vm_first_phys = phys_avail[0];
	for (i = 0; phys_avail[i + 1]; i += 2);
-	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG;
+	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	s = (vm_size_t) (sizeof(struct pv_entry) * npg);
-	s = i386_round_page(s);
+	s = round_page(s);
	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
	pv_table = (pv_entry_t) addr;

	/*
	 * init the pv free list
	 */
	init_pv_entries(npg);
	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * For now, VM is already on, we only need to map the
 * specified memory.
 */
vm_offset_t
pmap_map(virt, start, end, prot)
	vm_offset_t virt;
	vm_offset_t start;
	vm_offset_t end;
	int prot;
{
	while (start < end) {
		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
		virt += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	return (virt);
}

+#ifdef PMAP_KEEP_PDIRS
+int nfreepdir;
+caddr_t *pdirlist;
+#define NFREEPDIR 3
+
+static void *
+pmap_getpdir() {
+	caddr_t *pdir;
+	if (pdirlist) {
+		--nfreepdir;
+		pdir = pdirlist;
+		pdirlist = (caddr_t *) *pdir;
+		bzero( (caddr_t) pdir, PAGE_SIZE);
+	} else {
+		pdir = (caddr_t *) kmem_alloc(kernel_map, PAGE_SIZE);
+	}
+
+	return (void *) pdir;
+}
+
+static void
+pmap_freepdir(void *pdir) {
+	if (nfreepdir > NFREEPDIR) {
+		kmem_free(kernel_map, (vm_offset_t) pdir, PAGE_SIZE);
+	} else {
+		* (caddr_t *) pdir = (caddr_t) pdirlist;
+		pdirlist = (caddr_t *) pdir;
+		++nfreepdir;
+	}
+}
+#endif
+
/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap)
	register struct pmap *pmap;
{
	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
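pmap_getpdir()/pmap_freepdir() above cache a few page-directory pages by threading the free list through the pages themselves: the first word of each cached page points at the next one, so the cache needs no storage of its own. The same scheme in miniature, as a hypothetical user-space sketch (names and PAGE_SIZE here are illustrative, not the kernel's):

#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NFREE 3			/* keep at most this many pages cached */

static void **pagelist;		/* head of the cache */
static int nfree;

static void *
getpage(void)
{
	void **p;

	if (pagelist) {
		--nfree;
		p = pagelist;
		pagelist = (void **) *p;	/* unlink via the page's first word */
		memset(p, 0, PAGE_SIZE);	/* hand back zeroed, like the kernel */
	} else
		p = calloc(1, PAGE_SIZE);
	return p;
}

static void
freepage(void *pg)
{
	if (nfree > NFREE)
		free(pg);
	else {
		*(void **) pg = pagelist;	/* link through the page itself */
		pagelist = (void **) pg;
		++nfree;
	}
}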
*/ + +#ifdef PMAP_KEEP_PDIRS + pmap->pm_pdir = pmap_getpdir(); +#else pmap->pm_pdir = (pd_entry_t *) kmem_alloc(kernel_map, PAGE_SIZE); +#endif /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(int *) (pmap->pm_pdir + PTDPTDI) = ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; } /* * grow the number of kernel page table entries, if needed */ static vm_page_t nkpg; vm_offset_t kernel_vm_end; void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } - addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW); nkpg = NULL; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { +#ifdef PMAP_KEEP_PDIRS + pmap_freepdir( (void *)pmap->pm_pdir); +#else kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); +#endif } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } -#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) +#define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * free the pv_entry back to the free list */ -inline static void +static __inline void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
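The allocator the comment above describes (get_pv_entry()/pmap_alloc_pv_entry() just below) is a classic low-watermark free list: keep at least PV_FREELIST_MIN entries on hand, refill a whole page at a time, and never go through malloc(). Stripped of the pmap specifics, the pattern looks like this hypothetical sketch (LOWAT, PERPAGE and the static arena are stand-ins):

#include <stddef.h>

struct obj { struct obj *next; };
#define LOWAT 32
#define PERPAGE 128

static struct obj *freelist;
static int freecnt;
static struct obj arena[16][PERPAGE];	/* stand-in for freshly wired pages */
static int arenapg;

static void
obj_free(struct obj *o)
{
	o->next = freelist;
	freelist = o;
	++freecnt;
}

static void
refill(void)
{
	int i;

	/* grab one "page" and carve it into objects, like pmap_alloc_pv_entry() */
	for (i = 0; i < PERPAGE; i++)
		obj_free(&arena[arenapg][i]);
	arenapg++;
}

static struct obj *
obj_alloc(void)
{
	struct obj *o;

	if (freecnt < LOWAT || freelist == NULL)
		refill();		/* must leave freelist non-empty */
	--freecnt;
	o = freelist;
	freelist = o->next;
	return o;
}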
*/ -static inline pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? */ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; - newentries = (NBPG / sizeof(struct pv_entry)); + newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ - pvva += NBPG; + pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ - npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG; - pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); + npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + + PAGE_SIZE - 1) / PAGE_SIZE; + pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { return PTmap; } /* otherwise, we are alternate address space */ if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return APTmap; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { - for (npv = pv->pv_next; npv; npv = npv->pv_next) { + for (npv = pv->pv_next; npv; (pv = npv, npv = pv->pv_next)) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } - pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
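One idiom worth calling out before pmap_remove(): rounding an address up to the next page-directory boundary recurs throughout this file as (x + N) & ~(N - 1), which works because N is a power of two and always lands strictly beyond x. pmap_growkernel() above uses it with N = PAGE_SIZE * NPTEPG; pmap_remove() below uses the same trick in PTE-index space with N = NPTEPG. A stand-alone sanity check (user-space, illustrative values):

#include <assert.h>

#define PDRSZ (4096UL * 1024UL)		/* PAGE_SIZE * NPTEPG = 4MB */

int
main(void)
{
	/* always advances to the boundary strictly beyond the address */
	assert(((0x00401000UL + PDRSZ) & ~(PDRSZ - 1)) == 0x00800000UL);
	assert(((0x00400000UL + PDRSZ) & ~(PDRSZ - 1)) == 0x00800000UL);
	return 0;
}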
*/ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp, *ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ - if ((sva + NBPG) == eva) { + if ((sva + PAGE_SIZE) == eva) { if (*pmap_pde(pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if (!*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); } pmap_unuse_pt(pmap, sva); - pmap_update(); + pmap_update_1pg(sva); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (*pmap_pde(pmap, i386_ptob(sva)) == 0) { /* We can race ahead here, straight to next pde.. */ sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations that * are much faster than explicitly scanning when page tables * are not fully populated. */ if (*ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(ptq), "=a"(found) : "c"(nscan), "0"(ptq) : "cx"); if (!found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int) oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. XXX: should cluster them up and * invalidate as many as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we may be called * at interrupt time). */ pa = ((int) oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { - pmap_unuse_pt(pmap, va); + pmap_unuse_pt(pmap, (vm_offset_t) va); ++sva; continue; } if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } pmap_update(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { - register pv_entry_t pv, npv; + register pv_entry_t pv, opv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! 
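The inline assembly scan in pmap_remove() above (and again in pmap_protect() below) leans on repe scasl to skip runs of zero PTEs faster than a C loop of the day would. Its net effect in plain C is the sketch below (illustrative only; the real code backs the pointer up one slot after a hit, which is folded in here):

/*
 * Advance *pptq over zero PTEs, looking at no more than nscan slots.
 * Returns 1 with *pptq left on the first nonzero PTE, else 0 with
 * *pptq past the scanned run.
 */
static int
scan_nonzero(int **pptq, int nscan)
{
	int *p = *pptq;

	while (nscan-- > 0) {
		if (*p != 0) {
			*pptq = p;	/* found: leave pointer on the PTE */
			return 1;
		}
		p++;
	}
	*pptq = p;			/* exhausted: pointer past the run */
	return 0;
}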
 */
	if (!pmap_is_managed(pa))
		return;

-	pa = i386_trunc_page(pa);
-	pv = pa_to_pvh(pa);
-	m = PHYS_TO_VM_PAGE(pa);
+	pa = trunc_page(pa);
+	opv = pa_to_pvh(pa);
+	if (opv->pv_pmap == NULL)
+		return;
+	m = PHYS_TO_VM_PAGE(pa);

	s = splhigh();
-	while (pv->pv_pmap != NULL) {
-		pmap = pv->pv_pmap;
+	pv = opv;
+	while (pv && ((pmap = pv->pv_pmap) != NULL)) {
		ptp = get_pt_entry(pmap);
		va = pv->pv_va;
		pte = ptp + i386_btop(va);
		if (pmap_pte_w(pte))
			pmap->pm_stats.wired_count--;
		if (*pte) {
			pmap->pm_stats.resident_count--;
-			anyvalid++;
+			if (curproc != pageproc)
+				anyvalid++;

			/*
			 * Update the vm_page_t clean and reference bits.
			 */
			if ((int) *pte & PG_M) {
-				if (va < USRSTACK + (UPAGES * NBPG) ||
+				if (va < USRSTACK + (UPAGES * PAGE_SIZE) ||
				    (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) {
					PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
				}
			}
			*pte = 0;
			pmap_unuse_pt(pmap, va);
		}
+		pv = pv->pv_next;
+	}
+
+	for (pv = opv->pv_next; pv; pv = npv) {
		npv = pv->pv_next;
-		if (npv) {
-			*pv = *npv;
-			free_pv_entry(npv);
-		} else {
-			pv->pv_pmap = NULL;
-		}
+		free_pv_entry(pv);
	}
+
+	opv->pv_pmap = NULL;
+	opv->pv_next = NULL;
+
	splx(s);

	if (anyvalid)
		pmap_update();
}

/*
 * Set the physical protection on the
 * specified range of this map as requested.
 */
void
pmap_protect(pmap, sva, eva, prot)
	register pmap_t pmap;
	vm_offset_t sva, eva;
	vm_prot_t prot;
{
	register pt_entry_t *pte;
	register vm_offset_t va;
	int i386prot;
	register pt_entry_t *ptp;
	int evap = i386_btop(eva);
	int anyvalid = 0;

	if (pmap == NULL)
		return;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	if (prot & VM_PROT_WRITE)
		return;

	ptp = get_pt_entry(pmap);

	va = sva;
	while (va < eva) {
		int found = 0;
		int svap;
		vm_offset_t nscan;

		/*
		 * Page table page is not allocated.  Skip it, we don't want to
		 * force allocation of unnecessary PTE pages just to set the
		 * protection.
		 */
		if (!*pmap_pde(pmap, va)) {
			/* XXX: avoid address wrap around */
nextpde:
			if (va >= i386_trunc_pdr((vm_offset_t) - 1))
				break;
			va = i386_round_pdr(va + PAGE_SIZE);
			continue;
		}
		pte = ptp + i386_btop(va);

		if (*pte == 0) {
			/*
			 * scan for a non-empty pte
			 */
			svap = pte - ptp;
			nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap;

			if (nscan + svap > evap)
				nscan = evap - svap;

			found = 0;
			if (nscan)
				asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
				    : "=D"(pte), "=a"(found)
				    : "c"(nscan), "0"(pte) : "cx");

			if (!found)
				goto nextpde;

			pte -= 1;
			svap = pte - ptp;

			va = i386_ptob(svap);
		}

		anyvalid++;

		i386prot = pte_prot(pmap, prot);
		if (va < UPT_MAX_ADDRESS) {
			i386prot |= PG_u;
			if (va >= UPT_MIN_ADDRESS)
				i386prot |= PG_RW;
		}
		pmap_pte_set_prot(pte, i386prot);
		va += PAGE_SIZE;
	}
	if (anyvalid)
		pmap_update();
}

/*
 * Insert the given physical page (p) at
 * the specified virtual address (v) in the
 * target physical map with the protection requested.
 *
 * If specified, the page will be wired down, meaning
 * that the related pte can not be reclaimed.
 *
 * NB:  This is the only routine which MAY NOT lazy-evaluate
 * or lose information.  That is, this routine must actually
 * insert this page into the given map NOW.
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; - va = i386_trunc_page(va); - pa = i386_trunc_page(pa); + va = trunc_page(va); + pa = trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { - pmap_update(); + pmap_update_1pg(va); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
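pmap_qenter()/pmap_qremove() below (and pmap_kenter()/pmap_kremove() after them) now flush only the TLB entries they actually touch instead of the whole TLB. A typical temporary-window use, sketched as a fragment with hypothetical variables (scratch_va, m and buf are made up for illustration):

	/* map a wired page at a spare kernel VA, copy out of it, unmap */
	pmap_kenter(scratch_va, VM_PAGE_TO_PHYS(m));	/* one invlpg if a mapping was replaced */
	bcopy((caddr_t) scratch_va, buf, PAGE_SIZE);
	pmap_kremove(scratch_va);			/* one invlpg, not a full flush */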
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); - if (*pte) - anyvalid++; - *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + vm_offset_t tva = va + i * PAGE_SIZE; + pt_entry_t npte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + pte = vtopte(tva); + if (*pte && (*pte != npte)) + pmap_update_1pg(tva); + *pte = npte; } - if (anyvalid) - pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); + vm_offset_t tva = va + i * PAGE_SIZE; + pte = vtopte(tva); *pte = 0; + pmap_update_1pg(tva); } - pmap_update(); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; int wasvalid = 0; pte = vtopte(va); if (*pte) wasvalid++; *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V)); if (wasvalid) - pmap_update(); + pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ void pmap_kremove(va) vm_offset_t va; { register pt_entry_t *pte; pte = vtopte(va); *pte = (pt_entry_t) 0; - pmap_update(); + pmap_update_1pg(va); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static inline void +static __inline void pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pte = vtopte(va); /* a fault on the page table might occur here */ if (*pte) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_pmap = pmap; pv->pv_va = va; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place this entry * after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ((int) (pa | PG_V | PG_u)); pmap_use_pt(pmap, va); return; } -#define MAX_INIT_PT (512 * 4096) +#define MAX_INIT_PT (512) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
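Note the unit change in MAX_INIT_PT just above: it used to be a byte count (512 * 4096) compared against size, and is now a page count compared against psize = size >> PAGE_SHIFT, the same 2MB ceiling either way. A trivial check of that equivalence (user-space, illustrative):

#include <assert.h>

#define PAGE_SHIFT 12

int
main(void)
{
	unsigned long old_limit_bytes = 512UL * 4096UL;
	unsigned long new_limit_pages = 512UL;

	assert((old_limit_bytes >> PAGE_SHIFT) == new_limit_pages);
	return 0;
}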
 */
void
pmap_object_init_pt(pmap, addr, object, pindex, size)
	pmap_t pmap;
	vm_offset_t addr;
	vm_object_t object;
	vm_pindex_t pindex;
	vm_size_t size;
{
	vm_offset_t tmpidx;
	int psize;
	vm_page_t p;
	int objpgs;

-	if (!pmap || ((size > MAX_INIT_PT) &&
-		(object->resident_page_count > MAX_INIT_PT / PAGE_SIZE))) {
+	psize = (size >> PAGE_SHIFT);
+
+	if (!pmap || ((psize > MAX_INIT_PT) &&
+		(object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

-	psize = (size >> PAGE_SHIFT);
	/*
	 * if we are processing a major portion of the object, then scan the
	 * entire thing.
	 */
	if (psize > (object->size >> 2)) {
		objpgs = psize;

		for (p = object->memq.tqh_first;
		    ((objpgs > 0) && (p != NULL));
		    p = p->listq.tqe_next) {
			tmpidx = p->pindex;
			if (tmpidx < pindex) {
				continue;
			}
			tmpidx -= pindex;
			if (tmpidx >= psize) {
				continue;
			}
-			if (((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) &&
-			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
-			    (p->bmapped == 0) &&
+			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->busy == 0) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-				if (p->flags & PG_CACHE)
+				if (p->queue == PQ_CACHE)
					vm_page_deactivate(p);
				vm_page_hold(p);
				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap,
					addr + (tmpidx << PAGE_SHIFT),
					VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
			objpgs -= 1;
		}
	} else {
		/*
		 * else lookup the pages one-by-one.
		 */
		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
			p = vm_page_lookup(object, tmpidx + pindex);
-			if (p &&
-			    ((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) &&
-			    (p->bmapped == 0) &&
-			    (p->busy == 0) &&
+			if (p && (p->busy == 0) &&
			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-				if (p->flags & PG_CACHE)
+				if (p->queue == PQ_CACHE)
					vm_page_deactivate(p);
				vm_page_hold(p);
				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap,
					addr + (tmpidx << PAGE_SHIFT),
					VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
		}
	}
}

/*
+ * pmap_prefault provides a quick way of clustering
+ * pagefaults into a process's address space.  It is a "cousin"
+ * of pmap_object_init_pt, except it runs at page fault time instead
+ * of mmap time.
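pmap_prefault() below probes a fixed neighborhood around the faulting address, nearest pages first, via pmap_prefault_pageorder[]. Concretely, for a fault at 0x8000 with 4K pages it considers 0x7000, 0x9000, 0x6000 and 0xa000, in that order (user-space illustration):

#include <stdio.h>

#define NBPG 4096

static int pageorder[] = { -NBPG, NBPG, -2 * NBPG, 2 * NBPG };

int
main(void)
{
	unsigned addra = 0x8000;
	int i;

	for (i = 0; i < 4; i++)
		printf("probe 0x%x\n", addra + pageorder[i]);
	return 0;
}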
+ */ +#define PFBAK 2 +#define PFFOR 2 +#define PAGEORDER_SIZE (PFBAK+PFFOR) + +static int pmap_prefault_pageorder[] = { + -NBPG, NBPG, -2 * NBPG, 2 * NBPG +}; + +void +pmap_prefault(pmap, addra, entry, object) + pmap_t pmap; + vm_offset_t addra; + vm_map_entry_t entry; + vm_object_t object; +{ + int i; + vm_offset_t starta; + vm_offset_t addr; + vm_pindex_t pindex; + vm_page_t m; + int pageorder_index; + + if (entry->object.vm_object != object) + return; + + if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) + return; + + starta = addra - PFBAK * PAGE_SIZE; + if (starta < entry->start) { + starta = entry->start; + } else if (starta > addra) { + starta = 0; + } + + for (i = 0; i < PAGEORDER_SIZE; i++) { + vm_object_t lobject; + pt_entry_t *pte; + + addr = addra + pmap_prefault_pageorder[i]; + if (addr < starta || addr >= entry->end) + continue; + + pte = vtopte(addr); + if (*pte) + continue; + + pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; + lobject = object; + for (m = vm_page_lookup(lobject, pindex); + (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); + lobject = lobject->backing_object) { + if (lobject->backing_object_offset & (PAGE_MASK-1)) + break; + pindex += (lobject->backing_object_offset >> PAGE_SHIFT); + m = vm_page_lookup(lobject->backing_object, pindex); + } + + /* + * give-up when a page is not in memory + */ + if (m == NULL) + break; + + if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && + (m->busy == 0) && + (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { + + if (m->queue == PQ_CACHE) { + if (cnt.v_free_count + cnt.v_cache_count < + cnt.v_free_min) + break; + vm_page_deactivate(m); + } + vm_page_hold(m); + m->flags |= PG_MAPPED; + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); + vm_page_unhold(m); + } + } +} + +/* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); - /* - * When unwiring, set the modified bit in the pte -- could have been - * changed by the kernel - */ - if (!wired) - (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); - bzero(CADDR2, NBPG); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(phys); + bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); - *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); + *(int *) CMAP1 = PG_V | PG_KW | trunc_page(src); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(dst); #if __GNUC__ > 1 - memcpy(CADDR2, CADDR1, NBPG); + memcpy(CADDR2, CADDR1, PAGE_SIZE); #else - bcopy(CADDR1, CADDR2, NBPG); + bcopy(CADDR1, CADDR2, PAGE_SIZE); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
*/ - if (bit & PG_U) { + if (bit & (PG_U|PG_M)) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } - if (bit & PG_M) { - if (pv->pv_va >= USRSTACK) { - if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { - continue; - } - if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { - splx(s); - return TRUE; - } else if (pv->pv_va < KERNBASE) { - splx(s); - return FALSE; - } - } - } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; + int changed; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); - if (setem) + if (setem) { (int) npte = (int) *pte | bit; - else + } else { (int) npte = (int) *pte & ~bit; + } *pte = npte; } } splx(s); - pmap_update(); + if (curproc != pageproc) + pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. */ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
The non-cacheable bits are set on each * mapped page. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } + +#ifdef PMAP_DEBUG +pmap_pid_dump(int pid) { + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = &p->p_vmspace->vm_pmap; + for(i=0;i<1024;i++) { + pd_entry_t *pde; + pt_entry_t *pte; + unsigned base = i << PD_SHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for(j=0;j<1024;j++) { + unsigned va = base + (j << PG_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + return npte; + } + pte = pmap_pte( pmap, va); + if (pte && pmap_pte_v(pte)) { + vm_offset_t pa; + vm_page_t m; + pa = *(int *)pte; + m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + return npte; +} +#endif #ifdef DEBUG static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 13489) +++ head/sys/amd64/amd64/trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority 
without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
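Both trap_pfault() here (via PGEX_W) and trap_fatal() below decode the hardware page-fault error code: its low bits are PGEX_P (protection violation vs. page not present), PGEX_W (write access) and PGEX_U (fault taken in user mode). A user-space illustration of the decoding trap_fatal() prints (bit values are the architectural ones):

#include <stdio.h>

#define PGEX_P 0x01
#define PGEX_W 0x02
#define PGEX_U 0x04

int
main(void)
{
	int code = PGEX_W | PGEX_U;	/* user-mode write to a missing page */

	printf("%s %s, %s\n",
	    code & PGEX_U ? "user" : "supervisor",
	    code & PGEX_W ? "write" : "read",
	    code & PGEX_P ? "protection violation" : "page not present");
	return 0;
}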
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace" in ddb won't work.
 */
void
dblfault_handler()
{
	struct pcb *pcb = curpcb;

	if (pcb != NULL) {
		printf("\nFatal double fault:\n");
		printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip);
		printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp);
		printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp);
	}
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va, v;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	++p->p_lock;

	if ((caddr_t)va >= vm->vm_maxsaddr
	    && (caddr_t)va < (caddr_t)USRSTACK) {
		if (!grow(p, va)) {
			--p->p_lock;
			return (1);
		}
	}

	v = trunc_page(vtopte(va));

	/*
	 * wire the pte page
	 */
	if (va < USRSTACK) {
		vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE);

	/*
	 * unwire the pte page
	 */
	if (va < USRSTACK) {
		vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE);
	}

	--p->p_lock;

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * System call request from POSIX system call gate interface to kernel.
 * Like trap(), argument is call by reference.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int args[8], rval[2];
	u_int code;

	sticks = p->p_sticks;
	if (ISPL(frame.tf_cs) != SEL_UPL)
		panic("syscall");

	p->p_md.md_regs = (int *)&frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;
	/*
	 * Need to check if this is a 32 bit or 64 bit syscall.
	 */
	if (code == SYS_syscall) {
		/*
		 * Code is first argument, followed by actual args.
		 */
		code = fuword(params);
		params += sizeof(int);
	} else if (code == SYS___syscall) {
		/*
		 * Like syscall, but code is a quad, so as to maintain
		 * quad alignment for the rest of the arguments.
		 */
		code = fuword(params);
		params += sizeof(quad_t);
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	if ((i = callp->sy_narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
		goto bad;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL))
		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
	rval[0] = 0;
	rval[1] = frame.tf_edx;

	error = (*callp->sy_call)(p, args, rval);

	switch (error) {

	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = rval[0];
		frame.tf_edx = rval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes.
*/ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/amd64/amd64/vm_machdep.c =================================================================== --- head/sys/amd64/amd64/vm_machdep.c (revision 13489) +++ head/sys/amd64/amd64/vm_machdep.c (revision 13490) @@ -1,871 +1,871 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ - * $Id: vm_machdep.c,v 1.49 1995/12/14 08:31:01 phk Exp $ + * $Id: vm_machdep.c,v 1.50 1996/01/05 20:12:23 wollman Exp $ */ #include "npx.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_fault_quick __P((caddr_t v, int prot)); #ifdef BOUNCE_BUFFERS static vm_offset_t vm_bounce_kva __P((int size, int waitok)); static void vm_bounce_kva_free __P((vm_offset_t addr, vm_offset_t size, int now)); static vm_offset_t vm_bounce_page_find __P((int count)); static void vm_bounce_page_free __P((vm_offset_t pa, int count)); static volatile int kvasfreecnt; caddr_t bouncememory; int bouncepages; static int bpwait; static vm_offset_t *bouncepa; static int bmwait, bmfreeing; #define BITS_IN_UNSIGNED (8*sizeof(unsigned)) static int bounceallocarraysize; static unsigned *bounceallocarray; static int bouncefree; #define SIXTEENMEG (4096*4096) #define MAXBKVA 1024 int maxbkva = MAXBKVA*NBPG; /* special list that can be used at interrupt time for eventual kva free */ static struct kvasfree { vm_offset_t addr; vm_offset_t size; } kvaf[MAXBKVA]; /* * get bounce buffer pages (count physically contiguous) * (only 1 inplemented now) */ static vm_offset_t vm_bounce_page_find(count) int count; { int bit; int s,i; if (count != 1) panic("vm_bounce_page_find -- no support for > 1 page yet!!!"); s = splbio(); retry: for (i = 0; i < bounceallocarraysize; i++) { if (bounceallocarray[i] != 0xffffffff) { bit = ffs(~bounceallocarray[i]); if (bit) { bounceallocarray[i] |= 1 << (bit - 1) ; bouncefree -= count; splx(s); return bouncepa[(i * BITS_IN_UNSIGNED + (bit - 1))]; } } } bpwait = 1; tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0); goto retry; } static void vm_bounce_kva_free(addr, size, now) vm_offset_t addr; vm_offset_t size; int now; { int s = splbio(); kvaf[kvasfreecnt].addr = addr; kvaf[kvasfreecnt].size = size; ++kvasfreecnt; if( now) { /* * this will do wakeups */ vm_bounce_kva(0,0); } else { if (bmwait) { /* * if anyone is waiting on the bounce-map, then wakeup */ wakeup((caddr_t) io_map); bmwait = 0; } } splx(s); } /* * free count bounce buffer pages */ static void vm_bounce_page_free(pa, count) vm_offset_t pa; int count; { int allocindex; int index; int bit; if (count != 1) panic("vm_bounce_page_free -- no support for > 1 page yet!!!"); for(index=0;indexb_flags & B_BOUNCE) { printf("vm_bounce_alloc: called 
recursively???\n"); return; } if (bp->b_bufsize < bp->b_bcount) { printf( "vm_bounce_alloc: b_bufsize(0x%lx) < b_bcount(0x%lx) !!\n", bp->b_bufsize, bp->b_bcount); panic("vm_bounce_alloc"); } /* * This is not really necessary * if( bp->b_bufsize != bp->b_bcount) { * printf("size: %d, count: %d\n", bp->b_bufsize, bp->b_bcount); * } */ vastart = (vm_offset_t) bp->b_data; vaend = (vm_offset_t) bp->b_data + bp->b_bufsize; - vapstart = i386_trunc_page(vastart); - vapend = i386_round_page(vaend); + vapstart = trunc_page(vastart); + vapend = round_page(vaend); countvmpg = (vapend - vapstart) / NBPG; /* * if any page is above 16MB, then go into bounce-buffer mode */ va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) ++dobounceflag; if( pa == 0) panic("vm_bounce_alloc: Unmapped page"); va += NBPG; } if (dobounceflag == 0) return; if (bouncepages < dobounceflag) panic("Not enough bounce buffers!!!"); /* * allocate a replacement kva for b_addr */ kva = vm_bounce_kva(countvmpg*NBPG, 1); #if 0 printf("%s: vapstart: %x, vapend: %x, countvmpg: %d, kva: %x ", (bp->b_flags & B_READ) ? "read":"write", vapstart, vapend, countvmpg, kva); #endif va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) { /* * allocate a replacement page */ vm_offset_t bpa = vm_bounce_page_find(1); pmap_kenter(kva + (NBPG * i), bpa); #if 0 printf("r(%d): (%x,%x,%x) ", i, va, pa, bpa); #endif /* * if we are writing, the copy the data into the page */ if ((bp->b_flags & B_READ) == 0) { bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG); } } else { /* * use original page */ pmap_kenter(kva + (NBPG * i), pa); } va += NBPG; } /* * flag the buffer as being bounced */ bp->b_flags |= B_BOUNCE; /* * save the original buffer kva */ bp->b_savekva = bp->b_data; /* * put our new kva into the buffer (offset by original offset) */ bp->b_data = (caddr_t) (((vm_offset_t) kva) | ((vm_offset_t) bp->b_savekva & (NBPG - 1))); #if 0 printf("b_savekva: %x, newva: %x\n", bp->b_savekva, bp->b_data); #endif return; } /* * hook into biodone to free bounce buffer */ void vm_bounce_free(bp) struct buf *bp; { int i; vm_offset_t origkva, bouncekva, bouncekvaend; /* * if this isn't a bounced buffer, then just return */ if ((bp->b_flags & B_BOUNCE) == 0) return; /* * This check is not necessary * if (bp->b_bufsize != bp->b_bcount) { * printf("vm_bounce_free: b_bufsize=%d, b_bcount=%d\n", * bp->b_bufsize, bp->b_bcount); * } */ origkva = (vm_offset_t) bp->b_savekva; bouncekva = (vm_offset_t) bp->b_data; /* printf("free: %d ", bp->b_bufsize); */ /* * check every page in the kva space for b_addr */ for (i = 0; i < bp->b_bufsize; ) { vm_offset_t mybouncepa; vm_offset_t copycount; - copycount = i386_round_page(bouncekva + 1) - bouncekva; - mybouncepa = pmap_kextract(i386_trunc_page(bouncekva)); + copycount = round_page(bouncekva + 1) - bouncekva; + mybouncepa = pmap_kextract(trunc_page(bouncekva)); /* * if this is a bounced pa, then process as one */ - if ( mybouncepa != pmap_kextract( i386_trunc_page( origkva))) { + if ( mybouncepa != pmap_kextract( trunc_page( origkva))) { vm_offset_t tocopy = copycount; if (i + tocopy > bp->b_bufsize) tocopy = bp->b_bufsize - i; /* * if this is a read, then copy from bounce buffer into original buffer */ if (bp->b_flags & B_READ) bcopy((caddr_t) bouncekva, (caddr_t) origkva, tocopy); /* * free the bounce allocation */ /* printf("(kva: %x, pa: %x)", bouncekva, mybouncepa); */ vm_bounce_page_free(mybouncepa, 1); } origkva += copycount; bouncekva += 
copycount; i += copycount; } /* printf("\n"); */ /* * add the old kva into the "to free" list */ - bouncekva= i386_trunc_page((vm_offset_t) bp->b_data); - bouncekvaend= i386_round_page((vm_offset_t)bp->b_data + bp->b_bufsize); + bouncekva= trunc_page((vm_offset_t) bp->b_data); + bouncekvaend= round_page((vm_offset_t)bp->b_data + bp->b_bufsize); /* printf("freeva: %d\n", (bouncekvaend - bouncekva) / NBPG); */ vm_bounce_kva_free( bouncekva, (bouncekvaend - bouncekva), 0); bp->b_data = bp->b_savekva; bp->b_savekva = 0; bp->b_flags &= ~B_BOUNCE; return; } /* * init the bounce buffer system */ void vm_bounce_init() { int i; kvasfreecnt = 0; if (bouncepages == 0) return; bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED; bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT); if (!bounceallocarray) panic("Cannot allocate bounce resource array"); bouncepa = malloc(bouncepages * sizeof(vm_offset_t), M_TEMP, M_NOWAIT); if (!bouncepa) panic("Cannot allocate physical memory array"); for(i=0;i= SIXTEENMEG) panic("bounce memory out of range"); if( pa == 0) panic("bounce memory not resident"); bouncepa[i] = pa; bounceallocarray[i/(8*sizeof(int))] &= ~(1<<(i%(8*sizeof(int)))); } bouncefree = bouncepages; } #endif /* BOUNCE_BUFFERS */ /* * quick version of vm_fault */ static void vm_fault_quick(v, prot) caddr_t v; int prot; { if (prot & VM_PROT_WRITE) subyte(v, fubyte(v)); else fubyte(v); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the kernel stack and pcb, making the child * ready to run, and marking it so that it can return differently * than the parent. Returns 1 in the child process, 0 in the parent. * We currently double-map the user area so that the stack is at the same * address in each process; in the future we will probably relocate * the frame pointers on the stack after copying. */ int cpu_fork(p1, p2) register struct proc *p1, *p2; { register struct user *up = p2->p_addr; int offset; /* * Copy pcb and stack from proc p1 to p2. * We do this as cheaply as possible, copying only the active * part of the stack. The stack and pcb need to agree; * this is tricky, as the final pcb is constructed by savectx, * but its frame isn't yet on the stack when the stack is copied. * swtch compensates for this when the child eventually runs. * This should be done differently, with a single call * that copies and updates the pcb+stack, * replacing the bcopy and savectx. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; offset = mvesp() - (int)kstack; bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset, (unsigned) ctob(UPAGES) - offset); p2->p_md.md_regs = p1->p_md.md_regs; pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb); /* * * Arrange for a non-local goto when the new process * is started, to resume here, returning nonzero from setjmp. */ if (savectx(&up->u_pcb, 1)) { /* * Return 1 in child. */ return (1); } return (0); } void cpu_exit(p) register struct proc *p; { #if NNPX > 0 npxexit(p); #endif /* NNPX */ cnt.v_swtch++; cpu_switch(p); panic("cpu_exit"); } void -cpu_wait(p) struct proc *p; { -/* extern vm_map_t upages_map; */ - +cpu_wait(p) + struct proc *p; +{ /* drop per-process resources */ - pmap_remove(vm_map_pmap(u_map), (vm_offset_t) p->p_addr, - ((vm_offset_t) p->p_addr) + ctob(UPAGES)); + pmap_qremove((vm_offset_t) p->p_addr, UPAGES); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); vmspace_free(p->p_vmspace); } /* * Dump the machine specific header information at the start of a core dump. 
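vm_bounce_page_find() above treats the bounce pool as a bitmap of unsigned words and locates a free page with ffs() on the complemented word. A user-space sketch of the same single-page bitmap discipline; the pool size and the -1 return (where the kernel would tsleep() and retry) are illustrative:

    #include <stdio.h>
    #include <strings.h>                     /* ffs() */

    #define BITS_IN_UNSIGNED (8 * sizeof(unsigned))
    #define NWORDS 2                         /* 64 "pages" for the demo */

    static unsigned allocmap[NWORDS];        /* one bit per page, 1 = in use */

    static int
    page_find(void)
    {
        int i, bit;

        for (i = 0; i < NWORDS; i++) {
            if (allocmap[i] != 0xffffffff) {
                bit = ffs(~allocmap[i]);     /* first clear bit, 1-based */
                if (bit) {
                    allocmap[i] |= 1u << (bit - 1);
                    return (i * BITS_IN_UNSIGNED + (bit - 1));
                }
            }
        }
        return (-1);                         /* caller would sleep and retry */
    }

    static void
    page_free(int idx)
    {
        allocmap[idx / BITS_IN_UNSIGNED] &= ~(1u << (idx % BITS_IN_UNSIGNED));
    }

    int
    main(void)
    {
        int a = page_find(), b = page_find();

        printf("got pages %d and %d\n", a, b);
        page_free(a);
        printf("after free, next is %d\n", page_find());
        return (0);
    }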
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p)); } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register int npf; register caddr_t addr; int off; vm_offset_t kva; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); /* * this is the kva that is to be used for * the temporary kernel mapping */ kva = (vm_offset_t) bp->b_saveaddr; for (addr = (caddr_t)trunc_page(bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { /* * do the vm_fault if needed, do the copy-on-write thing when * reading stuff off device into memory. */ vm_fault_quick(addr, (bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = pmap_kextract((vm_offset_t) addr); if (pa == 0) panic("vmapbuf: page not present"); /* * hold the data page */ #ifdef DIAGNOSTIC if( VM_PAGE_TO_PHYS(PHYS_TO_VM_PAGE(pa)) != pa) panic("vmapbuf: confused PHYS_TO_VM_PAGE mapping"); #endif vm_page_hold(PHYS_TO_VM_PAGE(pa)); } addr = bp->b_saveaddr = bp->b_data; off = (int)addr & PGOFSET; npf = btoc(round_page(bp->b_bufsize + off)); bp->b_data = (caddr_t) (kva + off); while (npf--) { pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("vmapbuf: null page frame"); pmap_kenter(kva, trunc_page(pa)); addr += PAGE_SIZE; kva += PAGE_SIZE; } } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) pmap_kremove((vm_offset_t) addr); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = NULL; /* * unhold the pde, and data pages */ for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) { /* * release the data page */ pa = pmap_kextract((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } } /* * Force reset the processor by invalidating the entire address space! */ void cpu_reset() { /* * Attempt to do a CPU reset via the keyboard controller, * do not turn of the GateA20, as any machine that fails * to do the reset here would then end up in no man's land. 
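grow() further below extends the user stack in SGROWSIZ chunks rather than page by page, so a run of faults just under the current limit does not resize the map every time. A stand-alone illustration of the rounding arithmetic, with made-up values for PAGE_SIZE, SGROWSIZ and USRSTACK:

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define SGROWSIZ  (128 * 1024)      /* growth granularity, as an example */
    #define USRSTACK  0xefbfe000u       /* made-up top of stack for the demo */

    #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

    int
    main(void)
    {
        unsigned sp = USRSTACK - 200000;     /* faulting stack pointer */
        unsigned nss;

        /* Bytes of stack needed to cover sp, rounded to whole pages. */
        nss = roundup(USRSTACK - sp, PAGE_SIZE);
        /* Grow in SGROWSIZ chunks for hysteresis, as the comment says. */
        printf("need %u bytes, would grow to %u\n", nss, roundup(nss, SGROWSIZ));
        return (0);
    }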
*/ #ifndef BROKEN_KEYBOARD_RESET outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ printf("Keyboard reset did not work, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ #endif /* force a shutdown by unmapping entire address space ! */ bzero((caddr_t) PTD, NBPG); /* "good night, sweet prince .... " */ pmap_update(); /* NOTREACHED */ while(1); } /* * Grow the user stack to allow for 'sp'. This version grows the stack in * chunks of SGROWSIZ. */ int grow(p, sp) struct proc *p; u_int sp; { unsigned int nss; caddr_t v; struct vmspace *vm = p->p_vmspace; if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK) return (1); nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE); if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) return (0); if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) < nss) { int grow_amount; /* * If necessary, grow the VM that the stack occupies * to allow for the rlimit. This allows us to not have * to allocate all of the VM up-front in execve (which * is expensive). * Grow the VM by the amount requested rounded up to * the nearest SGROWSIZ to provide for some hysteresis. */ grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ); v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) - grow_amount; /* * If there isn't enough room to extend by SGROWSIZ, then * just extend to the maximum size */ if (v < vm->vm_maxsaddr) { v = vm->vm_maxsaddr; grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT); } if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v, - grow_amount, FALSE) != KERN_SUCCESS)) { + grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) { return (0); } vm->vm_ssize += grow_amount >> PAGE_SHIFT; } return (1); } /* * prototype routine to implement the pre-zeroed page mechanism * this routine is called from the idle loop. */ int vm_page_zero_idle() { vm_page_t m; if ((cnt.v_free_count > cnt.v_interrupt_free_min) && (m = vm_page_queue_free.tqh_first)) { TAILQ_REMOVE(&vm_page_queue_free, m, pageq); enable_intr(); pmap_zero_page(VM_PAGE_TO_PHYS(m)); disable_intr(); TAILQ_INSERT_HEAD(&vm_page_queue_zero, m, pageq); + m->queue = PQ_ZERO; ++vm_page_zero_count; return 1; } return 0; } Index: head/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_denode.c (revision 13489) +++ head/sys/fs/msdosfs/msdosfs_denode.c (revision 13490) @@ -1,728 +1,730 @@ -/* $Id: msdosfs_denode.c,v 1.14 1995/12/03 16:41:53 bde Exp $ */ +/* $Id: msdosfs_denode.c,v 1.15 1995/12/07 12:47:19 davidg Exp $ */ /* $NetBSD: msdosfs_denode.c,v 1.9 1994/08/21 18:44:00 ws Exp $ */ /*- * Copyright (C) 1994 Wolfgang Solfrank. * Copyright (C) 1994 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include /* defines "time" */ #include #include #include #include #include #include #include #include struct denode **dehashtbl; u_long dehash; /* size of hash table - 1 */ #define DEHASH(dev, deno) (((dev) + (deno)) & dehash) union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } static struct denode * msdosfs_hashget __P((dev_t dev, u_long dirclust, u_long diroff)); static void msdosfs_hashins __P((struct denode *dep)); static void msdosfs_hashrem __P((struct denode *dep)); int msdosfs_init() { dehashtbl = hashinit(desiredvnodes/2, M_MSDOSFSMNT, &dehash); return 0; } static struct denode * msdosfs_hashget(dev, dirclust, diroff) dev_t dev; u_long dirclust; u_long diroff; { struct denode *dep; for (;;) for (dep = dehashtbl[DEHASH(dev, dirclust + diroff)];; dep = dep->de_next) { if (dep == NULL) return NULL; if (dirclust != dep->de_dirclust || diroff != dep->de_diroffset || dev != dep->de_dev || dep->de_refcnt == 0) continue; if (dep->de_flag & DE_LOCKED) { dep->de_flag |= DE_WANTED; (void) tsleep((caddr_t)dep, PINOD, "msdhgt", 0); break; } if (!vget(DETOV(dep), 1)) return dep; break; } /* NOTREACHED */ } static void msdosfs_hashins(dep) struct denode *dep; { struct denode **depp, *deq; depp = &dehashtbl[DEHASH(dep->de_dev, dep->de_dirclust + dep->de_diroffset)]; deq = *depp; if (deq) deq->de_prev = &dep->de_next; dep->de_next = deq; dep->de_prev = depp; *depp = dep; } static void msdosfs_hashrem(dep) struct denode *dep; { struct denode *deq; deq = dep->de_next; if (deq) deq->de_prev = dep->de_prev; *dep->de_prev = deq; #ifdef DIAGNOSTIC dep->de_next = NULL; dep->de_prev = NULL; #endif } /* * If deget() succeeds it returns with the gotten denode locked(). 
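msdosfs_hashins() and msdosfs_hashrem() above keep each denode on its chain with de_prev pointing at whatever pointer currently points to the denode (a hash slot or a neighbour's de_next), so removal is O(1) and needs no list head or search. A generic sketch of that pointer-to-pointer idiom:

    #include <stdio.h>

    struct node {
        int          key;
        struct node *next;
        struct node **prev;     /* points at whatever points at us */
    };

    static struct node *head;

    static void
    ins(struct node *n)
    {
        if ((n->next = head) != NULL)
            head->prev = &n->next;
        n->prev = &head;
        head = n;
    }

    static void
    rem(struct node *n)
    {
        if (n->next != NULL)
            n->next->prev = n->prev;
        *n->prev = n->next;
    }

    int
    main(void)
    {
        struct node a = { 1, NULL, NULL }, b = { 2, NULL, NULL };

        ins(&a);
        ins(&b);
        rem(&a);                /* O(1), no walk from the head needed */
        printf("head key = %d\n", head->key);
        return (0);
    }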
* * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The pm_dev field and the address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * direntptr - address of the direntry structure of interest. If direntptr is * NULL, the block is read if necessary. * depp - returns the address of the gotten denode. */ int deget(pmp, dirclust, diroffset, direntptr, depp) struct msdosfsmount *pmp; /* so we know the maj/min number */ u_long dirclust; /* cluster this dir entry came from */ u_long diroffset; /* index of entry within the cluster */ struct direntry *direntptr; struct denode **depp; /* returns the addr of the gotten denode */ { int error; dev_t dev = pmp->pm_dev; struct mount *mntp = pmp->pm_mountp; struct denode *ldep; struct vnode *nvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %ld, diroffset %x, direntptr %p, depp %p)\n", pmp, dirclust, diroffset, direntptr, depp); #endif /* * If dir entry is given and refers to a directory, convert to * canonical form */ if (direntptr && (direntptr->deAttributes & ATTR_DIRECTORY)) { dirclust = getushort(direntptr->deStartCluster); if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. for root dir use cluster MSDOSFSROOT, * offset MSDOSFSROOT_OFS * * NOTE: The check for de_refcnt > 0 below insures the denode being * examined does not represent an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ ldep = msdosfs_hashget(dev, dirclust, diroffset); if (ldep) { *depp = ldep; return 0; } /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode(VT_MSDOSFS, mntp, msdosfs_vnodeop_p, &nvp); if (error) { *depp = 0; return error; } MALLOC(ldep, struct denode *, sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK); bzero((caddr_t)ldep, sizeof *ldep); nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_devvp = 0; ldep->de_lockf = 0; ldep->de_dev = dev; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; fc_purge(ldep, 0); /* init the fat cache for this denode */ /* * Insert the denode into the hash queue and lock the denode so it * can't be accessed until we've read it in and have done what we * need to it. */ VOP_LOCK(nvp); msdosfs_hashins(ldep); /* * Copy the directory entry into the denode area of the vnode. */ if (dirclust == MSDOSFSROOT && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. 
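The manufactured root-directory timestamp just below packs Jan 1, 1980 into the 16-bit DOS date format through the DD_*_SHIFT constants. A sketch of that packing, assuming the standard FAT layout (day in bits 0-4, month in bits 5-8, years since 1980 in bits 9-15):

    #include <stdio.h>

    #define DD_DAY_SHIFT   0    /* bits 0-4:  day of month, 1-31 */
    #define DD_MONTH_SHIFT 5    /* bits 5-8:  month, 1-12 */
    #define DD_YEAR_SHIFT  9    /* bits 9-15: years since 1980 */

    static unsigned short
    dos_date(int year, int month, int day)
    {
        return ((year - 1980) << DD_YEAR_SHIFT) |
            (month << DD_MONTH_SHIFT) | (day << DD_DAY_SHIFT);
    }

    int
    main(void)
    {
        unsigned short d = dos_date(1980, 1, 1);    /* the epoch deget() uses */

        printf("packed: 0x%04x, day %d, month %d, year %d\n", (unsigned)d,
            d & 0x1f, (d >> DD_MONTH_SHIFT) & 0x0f,
            1980 + (d >> DD_YEAR_SHIFT));
        return (0);
    }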
*/ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * pmp->pm_BytesPerSec; /* * fill in time and date so that dos2unixtime() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_Time = 0x0000; /* 00:00:00 */ ldep->de_Date = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ /* leave the other fields as garbage */ } else { bp = NULL; if (!direntptr) { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) return error; } DE_INTERNALIZE(ldep, direntptr); if (bp) brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ ldep->de_pmp = pmp; ldep->de_devvp = pmp->pm_devvp; ldep->de_refcnt = 1; if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; nvp->v_type = VDIR; if (ldep->de_StartCluster == MSDOSFSROOT) nvp->v_flag |= VROOT; else { error = pcbmap(ldep, 0xffff, 0, &size); if (error == E2BIG) { ldep->de_FileSize = size << pmp->pm_cnshift; error = 0; } else printf("deget(): pcbmap returned %d\n", error); } } else nvp->v_type = VREG; SETHIGH(ldep->de_modrev, mono_time.tv_sec); SETLOW(ldep->de_modrev, mono_time.tv_usec * 4294); VREF(ldep->de_devvp); *depp = ldep; return 0; } int deupdat(dep, tp, waitfor) struct denode *dep; struct timespec *tp; int waitfor; { int error; struct buf *bp; struct direntry *dirp; struct vnode *vp = DETOV(dep); #ifdef MSDOSFS_DEBUG printf("deupdat(): dep %p\n", dep); #endif /* * If the denode-modified and update-mtime bits are off, * or this denode is from a readonly filesystem, * or this denode is for a directory, * or the denode represents an open but unlinked file, * then don't do anything. DOS directory * entries that describe a directory do not ever get * updated. This is the way DOS treats them. */ if ((dep->de_flag & (DE_MODIFIED | DE_UPDATE)) == 0 || vp->v_mount->mnt_flag & MNT_RDONLY || dep->de_Attributes & ATTR_DIRECTORY || dep->de_refcnt <= 0) return 0; /* * Read in the cluster containing the directory entry we want to * update. */ error = readde(dep, &bp, &dirp); if (error) return error; /* * If the mtime is to be updated, put the passed in time into the * directory entry. */ if (dep->de_flag & DE_UPDATE) { dep->de_Attributes |= ATTR_ARCHIVE; unix2dostime(tp, &dep->de_Date, &dep->de_Time); } /* * The mtime is now up to date. The denode will be unmodifed soon. */ dep->de_flag &= ~(DE_MODIFIED | DE_UPDATE); /* * Copy the directory entry out of the denode into the cluster it * came from. */ DE_EXTERNALIZE(dirp, dep); /* * Write the cluster back to disk. If they asked for us to wait * for the write to complete, then use bwrite() otherwise use * bdwrite(). */ error = 0; /* note that error is 0 from above, but ... */ if (waitfor) error = bwrite(bp); else bdwrite(bp); return error; } /* * Truncate the file described by dep to the length specified by length. 
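detrunc() below isolates the byte offset into the last cluster with length & pm_crbomask and zeroes the tail so stale data cannot reappear if a later seek extends the file. A stand-alone sketch of the mask arithmetic, assuming a power-of-two cluster size; BPCLUSTER and the 5000-byte length are example values:

    #include <stdio.h>

    #define BPCLUSTER 2048u                 /* example cluster size (power of 2) */
    #define CRBOMASK  (BPCLUSTER - 1)       /* byte offset within a cluster */

    int
    main(void)
    {
        unsigned long length = 5000;
        unsigned long boff = length & CRBOMASK; /* tail bytes in last cluster */

        if (boff != 0)
            printf("zero %lu trailing bytes of the last cluster\n",
                BPCLUSTER - boff);
        /* clusters needed to hold `length' bytes, cf. de_clcount() */
        printf("clusters = %lu\n", (length + CRBOMASK) / BPCLUSTER);
        return (0);
    }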
*/ int detrunc(dep, length, flags, cred, p) struct denode *dep; u_long length; int flags; struct ucred *cred; struct proc *p; { int error; int allerror; int vflags; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %d, flags %d\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if (DETOV(dep)->v_flag & VROOT) { printf( "detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); return EINVAL; } - vnode_pager_setsize(DETOV(dep), length); - if (dep->de_FileSize < length) + if (dep->de_FileSize < length) { + vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); + } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return error; } } fc_purge(dep, (length + pmp->pm_crbomask) >> pmp->pm_cnshift); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { /* * should read from file vnode or filesystem vnode * depending on if file or dir */ if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { bn = de_blk(pmp, length); error = bread(DETOV(dep), bn, pmp->pm_bpcluster, NOCRED, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return error; } /* * is this the right place for it? */ bzero(bp->b_data + boff, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; dep->de_flag |= DE_UPDATE; vflags = (length > 0 ? V_SAVE : 0) | V_SAVEMETA; vinvalbuf(DETOV(dep), vflags, cred, p, 0, 0); + vnode_pager_setsize(DETOV(dep), length); TIMEVAL_TO_TIMESPEC(&time, &ts); allerror = deupdat(dep, &ts, 1); #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %d\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. 
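The code below caps the shortened file with CLUST_EOFE and hands the orphaned tail to freeclusterchain(), which follows next-cluster entries until it hits the end marker. A toy in-memory FAT showing that walk; the fat[] array, the cluster numbers and the FAT16-style 0xffff end marker are all illustrative:

    #include <stdio.h>

    #define CLUST_FREE 0x0000
    #define CLUST_EOFE 0xffff       /* end-of-file marker, as in FAT16 */
    #define NCLUST     16

    static unsigned fat[NCLUST];

    static void
    freeclusterchain(unsigned cn)
    {
        unsigned next;

        while (cn != CLUST_EOFE) {
            next = fat[cn];         /* remember the link before clearing it */
            fat[cn] = CLUST_FREE;
            printf("freed cluster %u\n", cn);
            cn = next;
        }
    }

    int
    main(void)
    {
        /* a three-cluster file: 3 -> 7 -> 9 -> EOF */
        fat[3] = 7; fat[7] = 9; fat[9] = CLUST_EOFE;
        freeclusterchain(3);
        return (0);
    }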
*/ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return error; } fc_setcache(dep, FC_LASTFC, (length - 1) >> pmp->pm_cnshift, eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(chaintofree)) freeclusterchain(pmp, chaintofree); return allerror; } /* * Extend the file described by dep to length specified by length. */ int deextend(dep, length, cred) struct denode *dep; off_t length; struct ucred *cred; { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; struct timespec ts; /* * The root of a DOS filesystem cannot be extended. */ if (DETOV(dep)->v_flag & VROOT) return EINVAL; /* * Directories can only be extended by the superuser. * Is this really important? */ if (dep->de_Attributes & ATTR_DIRECTORY) { error = suser(cred, NULL); if (error) return error; } if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return ENOSPC; error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred, NULL); return error; } } dep->de_flag |= DE_UPDATE; dep->de_FileSize = length; TIMEVAL_TO_TIMESPEC(&time, &ts); return deupdat(dep, &ts, 1); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ int reinsert(dep) struct denode *dep; { /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { msdosfs_hashrem(dep); msdosfs_hashins(dep); } return 0; } int msdosfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_reclaim(): pushing active", vp); /* * Remove the denode from the denode hash chain we are in. */ msdosfs_hashrem(dep); cache_purge(vp); /* * Indicate that one less file on the filesystem is open. */ if (dep->de_devvp) { vrele(dep->de_devvp); dep->de_devvp = 0; } dep->de_flag = 0; FREE(dep, M_MSDOSFSNODE); vp->v_data = NULL; return 0; } int msdosfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_inactive(): pushing active", vp); /* * Get rid of denodes related to stale file handles. Hmmm, what * does this really do? 
*/ if (dep->de_Name[0] == SLOT_DELETED) { if ((vp->v_flag & VXLOCK) == 0) vgone(vp); return 0; } /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %x, MNT_RDONLY %x\n", dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif VOP_LOCK(vp); if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED, NULL); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } if (dep->de_flag & (DE_MODIFIED | DE_UPDATE)) { TIMEVAL_TO_TIMESPEC(&time, &ts); deupdat(dep, &ts, 0); } VOP_UNLOCK(vp); dep->de_flag = 0; /* * If we are done with the denode, then reclaim it so that it can * be reused now. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vp->v_usecount, dep->de_Name[0]); #endif if (vp->v_usecount == 0 && dep->de_Name[0] == SLOT_DELETED) vgone(vp); return error; } Index: head/sys/fs/procfs/procfs_mem.c =================================================================== --- head/sys/fs/procfs/procfs_mem.c (revision 13489) +++ head/sys/fs/procfs/procfs_mem.c (revision 13490) @@ -1,246 +1,247 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)procfs_mem.c 8.4 (Berkeley) 1/21/94 * - * $Id: procfs_mem.c,v 1.13 1995/12/11 04:56:31 dyson Exp $ + * $Id: procfs_mem.c,v 1.14 1995/12/17 07:19:24 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *p, struct uio *uio)); static int procfs_rwmem(p, uio) struct proc *p; struct uio *uio; { int error; int writing; writing = uio->uio_rw == UIO_WRITE; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; vm_offset_t uva; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; vm_page_t m; boolean_t wired, single_use; vm_pindex_t pindex; u_int len; int fix_prot; uva = (vm_offset_t) uio->uio_offset; if (uva >= VM_MAXUSER_ADDRESS) { if (writing || (uva >= (VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE))) { error = 0; break; } } /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * Check the permissions for the area we're interested * in. */ fix_prot = 0; if (writing) fix_prot = !vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE); if (fix_prot) { /* * If the page is not writable, we make it so. * XXX It is possible that a page may *not* be * read/executable, if a process changes that! * We will assume, for now, that a page is either * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE. */ error = vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0); if (error) break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, writing ? VM_PROT_WRITE : VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); /* * We're done with tmap now. */ if (!error) vm_map_lookup_done(tmap, out_entry); /* * Fault the page in... */ if (!error && writing && object->backing_object) { m = vm_page_lookup(object, pindex); if (m == 0) error = vm_fault(map, pageno, VM_PROT_WRITE, FALSE); } /* Find space in kernel_map for the page we're interested in */ if (!error) error = vm_map_find(kernel_map, object, - IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1); + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!error) { /* * Neither vm_map_lookup() nor vm_map_find() appear * to add a reference count to the object, so we do * that here and now. */ vm_object_reference(object); /* * Mark the page we just found as pageable. */ error = vm_map_pageable(kernel_map, kva, kva + PAGE_SIZE, 0); /* * Now do the i/o move. */ if (!error) error = uiomove((caddr_t)(kva + page_offset), len, uio); vm_map_remove(kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); } while (error == 0 && uio->uio_resid > 0); return (error); } /* * Copy data in and out of the target process. 
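procfs_rwmem() above deliberately moves at most one page per loop iteration, clipping each transfer to min(PAGE_SIZE - page_offset, resid). A sketch of just that chunking over an ordinary buffer, with memcpy() standing in for the map-and-uiomove step:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096ul

    /* Copy `resid' bytes starting at offset `uva', one page per pass. */
    static void
    copy_paged(const char *src, char *dst, unsigned long uva,
        unsigned long resid)
    {
        unsigned long pageno, page_offset, len;

        while (resid > 0) {
            pageno = uva & ~(PAGE_SIZE - 1);    /* trunc_page() */
            page_offset = uva - pageno;
            len = PAGE_SIZE - page_offset;      /* rest of this page ... */
            if (len > resid)
                len = resid;                    /* ... or what remains */
            /* stands in for mapping the page and uiomove()ing it */
            memcpy(dst, src + uva, len);
            dst += len;
            uva += len;
            resid -= len;
        }
    }

    int
    main(void)
    {
        static char src[3 * 4096], dst[3 * 4096];

        memset(src, 'x', sizeof(src));
        copy_paged(src, dst, 100, 8000);        /* spans three pages */
        printf("%c%c\n", dst[0], dst[7999]);
        return (0);
    }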
* We do this by mapping the process's page into * the kernel and then doing a uiomove direct * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { int error; if (uio->uio_resid == 0) return (0); error = procfs_rwmem(p, uio); return (error); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode. */ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } Index: head/sys/gnu/ext2fs/ext2_bmap.c =================================================================== --- head/sys/gnu/ext2fs/ext2_bmap.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_bmap.c (revision 13490) @@ -1,317 +1,317 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 - * $Id: ufs_bmap.c,v 1.9 1995/09/04 00:21:09 dyson Exp $ + * $Id: ufs_bmap.c,v 1.10 1995/11/05 23:07:37 dyson Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Bmap converts the logical block number of a file to its physical block * number on the disk.
The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; register daddr_t bn; daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; } if (runb) { *runb = 0; } xap = ap == NULL ? a : ap; if (!nump) nump = &num; error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it.
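ufs_getlbns() further below decides how many levels of indirection a logical block needs by multiplying blockcnt by the per-block fanout until the remaining block number fits. A stand-alone version of that loop; NDADDR and NIADDR match the comments above, and the 2048-entry fanout is an example value for MNINDIR(ump):

    #include <stdio.h>

    #define NDADDR  12      /* direct blocks in the inode */
    #define NIADDR  3       /* single, double, triple indirect */
    #define MNINDIR 2048    /* block pointers per indirect block (example) */

    static int
    levels(long bn)
    {
        long long blockcnt;
        int i;

        if (bn < NDADDR)
            return (0);     /* direct block, no indirection */
        bn -= NDADDR;
        for (blockcnt = 1, i = NIADDR;; i--, bn -= blockcnt) {
            if (i == 0)
                return (-1);        /* EFBIG: beyond triple indirect */
            blockcnt *= MNINDIR;    /* data blocks reachable at this level */
            if (bn < blockcnt)
                break;
        }
        return (NIADDR - i + 1);    /* 1 = single, 2 = double, 3 = triple */
    }

    int
    main(void)
    {
        printf("%d %d %d\n", levels(5), levels(NDADDR),
            levels(NDADDR + 3000));
        return (0);
    }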
*/ if (bp) - brelse(bp); + bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparry: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], ((daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) - brelse(bp); + bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; register daddr_t bn; struct indir *ap; int *nump; { long metalbn, realbn; struct ufsmount *ump; int blockcnt, i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); blockcnt *= MNINDIR(ump); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(ump); off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; return (0); } Index: head/sys/gnu/ext2fs/ext2_inode.c =================================================================== --- head/sys/gnu/ext2fs/ext2_inode.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_inode.c (revision 13490) @@ -1,550 +1,551 @@ /* * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ext2_inode.c 8.5 (Berkeley) 12/30/93 */ #if !defined(__FreeBSD__) #include "quota.h" #include "diagnostic.h" #else #include "opt_quota.h" #endif #include #include #include #include #include #include #include #include #include #if !defined(__FreeBSD__) #include #endif #include #include #include #include #include #include #include #include #include #include #include static int ext2_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, long *)); int ext2_init() { return (ufs_init()); } /* * Update the access, modified, and inode change times as specified by the * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is * used to specify that the inode needs to be updated but that the times have * already been set. The access and modified times are taken from the second * and third parameters; the inode change time is always taken from the current * time. If waitfor is set, then wait for the disk write of the inode to * complete. 
*/ int ext2_update(ap) struct vop_update_args /* { struct vnode *a_vp; struct timeval *a_access; struct timeval *a_modify; int a_waitfor; } */ *ap; { register struct ext2_sb_info *fs; struct buf *bp; struct inode *ip; int error; #if !defined(__FreeBSD__) struct timeval time; #endif ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); return (0); } if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) return (0); if (ip->i_flag & IN_ACCESS) ip->i_atime.ts_sec = ap->a_access->tv_sec; if (ip->i_flag & IN_UPDATE) { ip->i_mtime.ts_sec = ap->a_modify->tv_sec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { #if !defined(__FreeBSD__) get_time(&time); #endif ip->i_ctime.ts_sec = time.tv_sec; } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); fs = ip->i_e2fs; if (error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp)) { brelse(bp); return (error); } ext2_di2ei( &ip->i_din, (struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number))); /* if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) return (bwrite(bp)); else { */ bdwrite(bp); return (0); /* } */ } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ int ext2_truncate(ap) struct vop_truncate_args /* { struct vnode *a_vp; off_t a_length; int a_flags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *ovp = ap->a_vp; register daddr_t lastblock; register struct inode *oip; daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; register struct ext2_sb_info *fs; struct buf *bp; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; struct timeval tv; register int i; int aflags, error, allerror; off_t osize; /* printf("ext2_truncate called %d to %d\n", VTOI(ovp)->i_number, ap->a_length); */ /* * negative file sizes will totally break the code below and * are not meaningful anyway. */ if (length < 0) return EFBIG; oip = VTOI(ovp); #if defined(__FreeBSD__) tv = time; #else get_time(&tv); #endif if (ovp->v_type == VLNK && oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { #if DIAGNOSTIC if (length != 0) panic("ext2_truncate: partial truncate of symlink"); #endif bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 0)); } #if QUOTA if (error = getinoquota(oip)) return (error); #endif - vnode_pager_setsize(ovp, (u_long)length); fs = oip->i_e2fs; osize = oip->i_size; ext2_discard_prealloc(oip); /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1.
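 * For example, with 1k blocks, growing a file to length 2500 gives
 * lbn = lblkno(fs, 2499) == 2 and offset = blkoff(fs, 2499) == 451, so
 * ext2_balloc() below is asked to make offset + 1 == 452 bytes of logical
 * block 2 valid, which covers the new last byte of the file.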
*/ if (osize < length) { offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; + vnode_pager_setsize(ovp, length); if (error = ext2_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, aflags)) return (error); oip->i_size = length; #if !defined(__FreeBSD__) (void) vnode_pager_uncache(ovp); #endif if (aflags & IO_SYNC) bwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zero'ed in case it ever becomes accessible again because * of subsequent file growth. */ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; if (error = ext2_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags)) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); #if !defined(__FreeBSD__) (void) vnode_pager_uncache(ovp); #endif bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (aflags & IO_SYNC) bwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->s_blocksize - 1) - 1; lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->s_blocksize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); for (level = TRIPLE; level >= SINGLE; level--) if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } for (i = NDADDR - 1; i > lastblock; i--) oip->i_db[i] = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; if (error = VOP_UPDATE(ovp, &tv, &tv, MNT_WAIT)) allerror = error; /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); oip->i_size = osize; vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->s_frag_size); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags.
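 * For example, truncating a file that used all NDADDR (12) direct blocks
 * down to two blocks leaves lastblock == 1, so the loop below clears and
 * frees i_db[11] through i_db[2]; truncating to 0 gives lastblock == -1
 * and releases every direct block.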
*/ for (i = NDADDR - 1; i > lastblock; i--) { register long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("itrunc: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #if DIAGNOSTIC for (level = SINGLE; level <= TRIPLE; level++) if (newblks[NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); for (i = 0; i < NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); if (length == 0 && (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) panic("itrunc3"); #endif /* DIAGNOSTIC */ /* * Put back the real size. */ oip->i_size = length; oip->i_blocks -= blocksreleased; if (oip->i_blocks < 0) /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; + vnode_pager_setsize(ovp, length); #if QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif return (allerror); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(ip, lbn, dbn, lastbn, level, countp) register struct inode *ip; daddr_t lbn, lastbn; daddr_t dbn; int level; long *countp; { register int i; struct buf *bp; register struct ext2_sb_info *fs = ip->i_e2fs; register daddr_t *bap; struct vnode *vp; daddr_t *copy, nb, nlbn, last; long blkcount, factor; int nblocks, blocksreleased = 0; int error = 0, allerror = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->s_blocksize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->s_blocksize, 0, 0); if (bp->b_flags & (B_DONE | B_DELWRI)) { /* Braces must be here in case trace evaluates to nothing. 
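 * A short illustration of the pitfall: on FreeBSD the #if !defined
 * block below compiles away entirely, so without the braces the
 * conditional would reduce to
 *
 *	if (bp->b_flags & (B_DONE | B_DELWRI))
 *	else { ... }
 *
 * which is a syntax error. The empty braced block keeps a valid
 * statement in place under either compilation.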
*/ #if !defined(__FreeBSD__) trace(TR_BREADHIT, pack(vp, fs->s_blocksize), lbn); #endif } else { #if !defined(__FreeBSD__) trace(TR_BREADMISS, pack(vp, fs->s_blocksize), lbn); get_proc()->p_stats->p_ru.ru_inblock++; /* pay for read */ #endif bp->b_flags |= B_READ; if (bp->b_bcount > bp->b_bufsize) panic("ext2_indirtrunc: bad buffer size"); bp->b_blkno = dbn; #if defined(__FreeBSD__) vfs_busy_pages(bp, 0); #endif VOP_STRATEGY(bp); error = biowait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (daddr_t *)bp->b_data; MALLOC(copy, daddr_t *, fs->s_blocksize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->s_blocksize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; error = bwrite(bp); if (error) allerror = error; bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { if (error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount)) allerror = error; blocksreleased += blkcount; } ext2_blkfree(ip, nb, fs->s_blocksize); blocksreleased += nblocks; } /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { if (error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount)) allerror = error; blocksreleased += blkcount; } } FREE(copy, M_TEMP); *countp = blocksreleased; return (allerror); } /* * discard preallocated blocks */ int ext2_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { ext2_discard_prealloc(VTOI(ap->a_vp)); return ufs_inactive(ap); } Index: head/sys/gnu/ext2fs/ext2_readwrite.c =================================================================== --- head/sys/gnu/ext2fs/ext2_readwrite.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_readwrite.c (revision 13490) @@ -1,323 +1,326 @@ /* * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 */ #if !defined(__FreeBSD__) #include "diagnostic.h" #endif #define BLKSIZE(a, b, c) blksize(a, b, c) #define FS struct ext2_sb_info #define I_FS i_e2fs #define READ ext2_read #define READ_S "ext2_read" #define WRITE ext2_write #define WRITE_S "ext2_write" /* * Vnode op for reading. */ /* ARGSUSED */ static int READ(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct inode *ip; register struct uio *uio; register FS *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; u_short mode; vp = ap->a_vp; ip = VTOI(vp); mode = ip->i_mode; uio = ap->a_uio; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("%s: mode", READ_S); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", READ_S); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", READ_S, vp->v_type); #endif fs = ip->I_FS; #if 0 if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) return (EFBIG); #endif for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->s_frag_size - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) > ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if (doclusterread) error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == vp->v_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); - if (error) + if (error) { + brelse(bp); + bp = NULL; break; + } vp->v_lastr = lbn; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; #if !defined(__FreeBSD__) if (S_ISREG(mode) && (xfersize + blkoffset == fs->s_frag_size || uio->uio_offset == ip->i_size)) bp->b_flags |= B_AGE; #endif - brelse(bp); + bqrelse(bp); } if (bp != NULL) - brelse(bp); + bqrelse(bp); ip->i_flag |= IN_ACCESS; return (error); } /* * Vnode op for writing. 
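 * (A note on the release discipline in the read path above, which now
 * returns error-free buffers with bqrelse() instead of brelse(); a
 * minimal sketch, assuming the 4.4BSD-style buffer-cache interfaces:
 *
 *	error = bread(vp, lbn, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);	(I/O failed: discard the buffer)
 *		return (error);
 *	}
 *	uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 *	bqrelse(bp);	(data still valid: requeue it cheaply)
 *
 * so that sequentially re-read blocks stay in the cache.)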
*/ static int WRITE(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct uio *uio; register struct inode *ip; register FS *fs; struct buf *bp; struct proc *p; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("%s: mode", WRITE_S); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: if ((ioflag & IO_SYNC) == 0) panic("%s: nonsync dir write", WRITE_S); break; default: panic("%s: type", WRITE_S); } fs = ip->I_FS; #if 0 if (uio->uio_offset < 0 || (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); #endif /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ p = uio->uio_procp; if (vp->v_type == VREG && p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); return (EFBIG); } resid = uio->uio_resid; osize = ip->i_size; flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->s_frag_size - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; #if defined(__FreeBSD__) if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, (u_long)uio->uio_offset + xfersize); #endif if (fs->s_frag_size > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error) break; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; #if !defined(__FreeBSD__) vnode_pager_setsize(vp, (u_long)ip->i_size); #endif } #if !defined(__FreeBSD__) (void)vnode_pager_uncache(vp); #endif size = BLKSIZE(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->s_frag_size) { if (doclusterwrite) { #if defined(__FreeBSD__) bp->b_flags |= B_CLUSTEROK; #endif cluster_write(bp, ip->i_size); } else { #if !defined(__FreeBSD__) bp->b_flags |= B_AGE; #endif bawrite(bp); } } else { #if defined(__FreeBSD__) if (doclusterwrite) bp->b_flags |= B_CLUSTEROK; #endif bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
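 * Concretely, resid holds the originally requested byte count and
 * uio->uio_resid what is still unwritten, so resid > uio->uio_resid holds
 * exactly when at least one byte reached the file. For a hypothetical
 * 3000-byte write that fails after 1536 bytes, uio_resid ends at 1464;
 * the IO_UNIT branch below then truncates back to osize, rewinds
 * uio_offset by 3000 - 1464 == 1536, and restores uio_resid to 3000.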
*/ if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_mode &= ~(ISUID | ISGID); if (error) { if (ioflag & IO_UNIT) { (void)VOP_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { struct timeval tv; #if !defined(__FreeBSD__) get_time(&tv); #else tv = time; #endif error = VOP_UPDATE(vp, &tv, &tv, 1); } return (error); } Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 13489) +++ head/sys/i386/i386/machdep.c (revision 13490) @@ -1,1820 +1,1820 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.168 1996/01/04 21:10:53 wollman Exp $ + * $Id: machdep.c,v 1.169 1996/01/05 20:12:19 wollman Exp $ */ #include "npx.h" #include "isa.h" #include "opt_sysvipc.h" #include "opt_ddb.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void i486_bzero __P((void *, size_t)); extern void i586_bzero __P((void *, size_t)); extern void i686_bzero __P((void *, size_t)); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static void identifycpu(void); char machine[] = "i386"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, ""); struct kern_devconf kdc_cpu0 = { 0, 0, 0, /* filled in by dev_attach */ "cpu", 0, { MDDT_CPU }, 0, 0, 0, CPU_EXTERNALLEN, 0, /* CPU has no parent */ 0, /* no parentdata */ DC_BUSY, /* the CPU is always busy */ cpu_model, /* no sense in duplication */ DC_CLS_CPU /* class */ }; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif #ifdef BOUNCE_BUFFERS extern char *bouncememory; extern int maxbkva; #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ extern int freebufspace; int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; int physmem = 0; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int boothowto = 0, bootverbose = 0, Maxmem = 0; static int badpages = 0; long dumplo; extern int bootdev; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) int cpu_class; static void dumpsys __P((void)); static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Initialize 
error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); startrtclock(); identifycpu(); printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (badpages != 0) { int indx = 1; /* * XXX skip reporting ISA hole & unmanaged kernel memory */ if (phys_avail[0] == PAGE_SIZE) indx += 2; printf("Physical memory hole(s):\n"); for (; phys_avail[indx + 1] != 0; indx += 2) { int size = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size, size / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* #ifdef ISDN DONET(isdnintr, NETISR_ISDN); #endif */ /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
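The valloc()/valloclim() macros above only bump a cursor; the sizing trick is the two-pass loop they sit in. A condensed sketch of the idiom (same names as cpu_startup(); any table added between the passes must appear identically in both, or the consistency check fires):

	/*
	 * Pass 1: firstaddr == 0, so the valloc()s merely measure the
	 * total.  Pass 2: rerun against real memory to hand out the
	 * final addresses.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;
	valloc(callout, struct callout, ncallout);
	/* ... every other valloc() goes here, in both passes ... */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = (int)kmem_alloc(kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");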
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 12, 1024); } nswbuf = min(nbuf, 128); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; } v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(nmbclusters+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, nmbclusters+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, nmbclusters * MCLBYTES, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; if (boothowto & RB_CONFIG) { userconfig(); cninit(); /* the preferred console may have changed */ } #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif /* * XXX allocate a contiguous area for ISA (non busmaster) DMA * operations. This _should_ only be done if the DMA channels * will actually be used, but for now we do it always. */ #define DMAPAGES 8 isaphysmem = vm_page_alloc_contig(DMAPAGES * PAGE_SIZE, 0, 0xfffffful, 64*1024); printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * In verbose mode, print out the BIOS's idea of the disk geometries. */ if (bootverbose) { printf("BIOS Geometries:\n"); for (i = 0; i < N_BIOS_GEOM; i++) { unsigned long bios_geom; int max_cylinder, max_head, max_sector; bios_geom = bootinfo.bi_bios_geom[i]; /* * XXX the bootstrap punts a 1200K floppy geometry * when the get-disk-geometry interrupt fails. Skip * drives that have this geometry. 
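Each bootinfo.bi_bios_geom[] word decoded below packs an INT 13h maximum C/H/S triple into byte-sized fields; the layout the shifts assume, restated:

	/*
	 * bios_geom field layout (maximum values, not counts):
	 *   bits 16..31  max cylinder   (cylinders number from 0)
	 *   bits  8..15  max head       (heads number from 0)
	 *   bits  0..7   max sector     (sectors number from 1)
	 * So 0x4f010f decodes to 79/1/15: the 80-cylinder, 2-head,
	 * 15-sector 1200K floppy geometry the bootstrap fakes when the
	 * BIOS get-disk-geometry call fails, which is why it is skipped.
	 */
	max_cylinder = bios_geom >> 16;		/* count = max + 1 */
	max_head = (bios_geom >> 8) & 0xff;	/* count = max + 1 */
	max_sector = bios_geom & 0xff;		/* count = max     */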
*/ if (bios_geom == 0x4f010f) continue; printf(" %x:%08lx ", i, bios_geom); max_cylinder = bios_geom >> 16; max_head = (bios_geom >> 8) & 0xff; max_sector = bios_geom & 0xff; printf( "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n", max_cylinder, max_cylinder + 1, max_head, max_head + 1, max_sector, max_sector); } printf(" %d accounted for\n", bootinfo.bi_n_bios_used); } } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } static struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "Pentium", CPUCLASS_586 }, /* CPU_586 */ { "Cy486DLC", CPUCLASS_486 }, /* CPU_486DLC */ { "Pentium Pro", CPUCLASS_686 }, /* CPU_686 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { cpu_class = i386_cpus[cpu].cpu_class; strncpy(cpu_model, i386_cpus[cpu].cpu_name, sizeof cpu_model); } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } #if defined(I586_CPU) || defined(I686_CPU) if (cpu_class == CPUCLASS_586 || cpu_class == CPUCLASS_686) { calibrate_cyclecounter(); } #endif #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if (!strcmp(cpu_vendor,"GenuineIntel")) { if ((cpu_id & 0xf00) > 3) { cpu_model[0] = '\0'; switch (cpu_id & 0x3000) { case 0x1000: strcpy(cpu_model, "Overdrive "); break; case 0x2000: strcpy(cpu_model, "Dual "); break; } switch (cpu_id & 0xf00) { case 0x400: strcat(cpu_model, "i486 "); break; case 0x500: strcat(cpu_model, "Pentium"); /* nb no space */ break; case 0x600: strcat(cpu_model, "Pentium Pro"); break; default: strcat(cpu_model, "unknown"); break; } switch (cpu_id & 0xff0) { case 0x400: strcat(cpu_model, "DX"); break; case 0x410: strcat(cpu_model, "DX"); break; case 0x420: strcat(cpu_model, "SX"); break; case 0x430: strcat(cpu_model, "DX2"); break; case 0x440: strcat(cpu_model, "SL"); break; case 0x450: strcat(cpu_model, "SX2"); break; case 0x470: strcat(cpu_model, "DX2 Write-Back Enhanced"); break; case 0x480: strcat(cpu_model, "DX4"); break; break; } } } #endif printf("%s (", cpu_model); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; #if defined(I386_CPU) case CPUCLASS_386: printf("386"); break; #endif #if defined(I486_CPU) case CPUCLASS_486: printf("486"); bzero = i486_bzero; break; #endif #if defined(I586_CPU) case CPUCLASS_586: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); bzero = i586_bzero; break; #endif #if defined(I686_CPU) case CPUCLASS_686: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); bzero = i686_bzero; break; #endif default: printf("unknown"); /* will panic below... 
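The masks identifycpu() applies to cpu_id correspond to the standard CPUID signature fields; for reference, the layout those masks assume (stepping/model/family are illustrative locals, not variables from the source):

	/*
	 * cpu_id bit fields as used above:
	 *   bits  0..3   stepping  (cpu_id & 0xf)
	 *   bits  4..7   model     (cpu_id & 0xff0 selects family+model)
	 *   bits  8..11  family    (4 = i486, 5 = Pentium, 6 = Pentium Pro)
	 *   bits 12..13  type      (0x1000 = OverDrive, 0x2000 = dual)
	 */
	stepping = cpu_id & 0xf;
	model = (cpu_id >> 4) & 0xf;
	family = (cpu_id >> 8) & 0xf;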
*/ } printf("-class CPU)\n"); #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if(*cpu_vendor) printf(" Origin = \"%s\"",cpu_vendor); if(cpu_id) printf(" Id = 0x%lx",cpu_id); if (!strcmp(cpu_vendor, "GenuineIntel")) { printf(" Stepping=%ld", cpu_id & 0xf); if (cpu_high > 0) { #define FEATUREFMT "\020\001FPU\002VME\003PSE\004MCE\005CX8\006APIC" printf("\n Features=0x%b", cpu_feature, FEATUREFMT); } } /* Avoid ugly blank lines: only print newline when we have to. */ if (*cpu_vendor || cpu_id) printf("\n"); #endif /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif #if !defined(I686_CPU) case CPUCLASS_686: #endif panic("CPU class not configured"); default: break; } dev_attach(&kdc_cpu0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. 
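The frame assembled below defines what the process sees when the trampoline (pcb_sigc, copied from sigcode at init386() time) dispatches the signal. A sketch of the handler-side view; the name handler is illustrative, the first three arguments follow sf_signum/sf_code/sf_scp, and sf_addr rides along in the frame as a fault-address slot:

	/*
	 * Effect of the trampoline, roughly: call
	 *	(*sf_handler)(sf_signum, sf_code, sf_scp);
	 * then fall into sigreturn(sf_scp) to restore the interrupted
	 * context.  A matching userland handler:
	 */
	void
	handler(sig, code, scp)
		int sig, code;
		struct sigcontext *scp;
	{
		/* scp->sc_pc, scp->sc_sp, scp->sc_mask: interrupted state */
	}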
*/ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs[tERR]; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs[tEAX]; sf.sf_sc.sc_ebx = regs[tEBX]; sf.sf_sc.sc_ecx = regs[tECX]; sf.sf_sc.sc_edx = regs[tEDX]; sf.sf_sc.sc_esi = regs[tESI]; sf.sf_sc.sc_edi = regs[tEDI]; sf.sf_sc.sc_cs = regs[tCS]; sf.sf_sc.sc_ds = regs[tDS]; sf.sf_sc.sc_ss = regs[tSS]; sf.sf_sc.sc_es = regs[tES]; sf.sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs[tESP]; sf.sf_sc.sc_fp = regs[tEBP]; sf.sf_sc.sc_pc = regs[tEIP]; sf.sf_sc.sc_ps = regs[tEFLAGS]; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); }; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = scp->sc_ps; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs[tEFLAGS] & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
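The XOR test in EFLAGS_SECURE() is terse; the same check spelled out as a function (eflags_change_ok is an illustrative name, not part of the source):

	/*
	 * new ^ old has a 1 bit wherever the two flag words differ;
	 * clearing the PSL_USERCHANGE positions (the arithmetic flags
	 * and the other bits a user may change freely) leaves non-zero
	 * exactly when a privileged or reserved bit was touched.
	 * PSL_RF is stripped from both sides first, per the XXX comment
	 * above, so resume-flag noise is tolerated.
	 */
	static int
	eflags_change_ok(new_ef, old_ef)
		int new_ef, old_ef;
	{
		int changed = (new_ef ^ old_ef) & ~PSL_RF;

		return ((changed & ~PSL_USERCHANGE) == 0);
	}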
*/ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } static int waittime = -1; static struct pcb dumppcb; __dead void boot(howto) int howto; { if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { register struct buf *bp; int iter, nbusy; waittime = 0; printf("\nsyncing disks... "); sync(&proc0, NULL, NULL); for (iter = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; } } if (nbusy == 0) break; printf("%d ", nbusy); DELAY(40000 * iter); } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("giving up\n"); #ifdef SHOW_BUSYBUFS nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); } } DELAY(5000000); /* 5 seconds */ #endif } else { printf("done\n"); /* * Unmount filesystems */ if (panicstr == 0) vfs_unmountall(); } DELAY(100000); /* wait for console output to finish */ dev_shutdownall(FALSE); } splhigh(); if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cngetc(); } else { if (howto & RB_DUMP) { if (!cold) { savectx(&dumppcb, 0); dumppcb.pcb_ptd = rcr3(); dumpsys(); } if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { int loop; printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ if (cncheckc()) /* Did user type a key? */ break; } if (!loop) goto die; } } else { /* zero time specified - reboot NOW */ goto die; } printf("--> Press a key on the console to reboot <--\n"); cngetc(); } } die: printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ cpu_reset(); for(;;) ; /* NOTREACHED */ } /* * Magic number for savecore * * exported (symorder) and used at least by savecore(8) * */ u_long dumpmag = 0x8fca0101UL; static int dumpsize = 0; /* also for savecore */ static int dodump = 1; SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. 
*/ static void dumpsys() { if (!dodump) return; if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; if (!(bdevsw[major(dumpdev)])) return; if (!(bdevsw[major(dumpdev)]->d_dump)) return; dumpsize = Maxmem; printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { int *regs = p->p_md.md_regs; bzero(regs, sizeof(struct trapframe)); regs[tEIP] = entry; regs[tESP] = stack; regs[tEFLAGS] = PSL_USER | (regs[tEFLAGS] & PSL_T); regs[tSS] = _udatasel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; union descriptor gdt[NGDT]; /* global descriptor table */ struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, 
/* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) kstack, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* 
segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall); #if defined(COMPAT_LINUX) || defined(LINUX) extern inthand_t IDTVEC(linux_syscall); #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; int pagesinbase, pagesinext; int target_page, pa_indx; proc0.p_addr = proc0paddr; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was - * i386_btop(i386_round_page(etext)) - 1. + * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. */ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * NBPG) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; /* Note. 
eventually want private ldts per process */ for (x = 0; x < NLDT; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #if defined(COMPAT_LINUX) || defined(LINUX) setidt(0x80, &IDTVEC(linux_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * Print a warning if the official BIOS interface disagrees * with the hackish interface used above. Eventually only * the official interface should be used. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) printf("BIOS basemem (%ldK) != RTC basemem (%dK)\n", bootinfo.bi_basemem, biosbasemem); if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) biosbasemem = 640; /* * Some 386 machines might give us a bogus number for extended * mem. If this happens, stop now. */ #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); /* NOTREACHED */ } #endif pagesinbase = biosbasemem * 1024 / NBPG; pagesinext = biosextmem * 1024 / NBPG; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. 
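The RTC sizing above is plain CMOS byte assembly; restated with the unit conversions that follow from it (the sizes are kilobytes, two bytes each, low byte first):

	/*
	 * CMOS memory sizes are 16-bit values in K, split across two
	 * one-byte registers.  640K base reads back as 0x80/0x02 and
	 * 15360K extended as 0x00/0x3c.  basemem is then clamped to
	 * 640K, since some BIOSes claim more than can exist below the
	 * ISA hole.
	 */
	biosbasemem = rtcin(RTC_BASELO) + (rtcin(RTC_BASEHI) << 8);
	biosextmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
	pagesinbase = biosbasemem * 1024 / NBPG;	/* K -> pages */
	pagesinext = biosextmem * 1024 / NBPG;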
*/ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of of the physical address space. It */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; #ifdef MAXMEM Maxmem = MAXMEM/4; #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. */ pa_indx = 0; badpages = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_KW | PG_N | target_page; pmap_update(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } else { badpages++; page_bad = FALSE; } } *(int *)CMAP1 = 0; pmap_update(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* make a initial tss so microp can get interrupt stack on syscall! 
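The array built by the page-test loop above is consumed throughout the pmap code; its walking convention, as a sketch:

	/*
	 * phys_avail[] layout: even slots hold a chunk start, odd slots
	 * hold the address one byte past the chunk's last page, and a
	 * zero end slot terminates the list (hence the array keeping a
	 * spare pair, per PHYS_AVAIL_ARRAY_END).  Every page inside a
	 * chunk passed the pattern test.
	 */
	int i;
	vm_offset_t pa;

	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		for (pa = phys_avail[i]; pa < phys_avail[i + 1];
		    pa += PAGE_SIZE)
			;	/* pa is a tested, usable physical page */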
*/ proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG; proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); ((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt = (sizeof(struct i386tss))<<16; ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode); proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_ptd = IdlePTD; } /* * The registers are in the frame; the frame is in the user area of * the process in question; when the process is active, the registers * are in "the kernel stack"; when it's not, they're still there, but * things get flipped around. So, since p->p_md.md_regs is the whole address * of the register set, take its offset from the kernel stack, and * index into the user block. Don't you just *love* virtual memory? * (I'm starting to think seymour is right...) */ #define TF_REGP(p) ((struct trapframe *) \ ((char *)(p)->p_addr \ + ((char *)(p)->p_md.md_regs - kstack))) int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { TF_REGP(p)->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { TF_REGP(p)->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
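ptrace_write_u() below accepts an offset only if a whole int fits inside one of two windows of the user area; the check, factored into a helper for clarity (off_in_window is illustrative, not in the source):

	/*
	 * Writable windows in struct user: the trapframe (general
	 * registers, further vetted by EFLAGS_SECURE/CS_SECURE) and
	 * pcb_savefpu (FP state).  Everything else in the U area is
	 * privileged kernel state and gets EFAULT.
	 */
	static int
	off_in_window(off, start, len)
		vm_offset_t off, start, len;
	{
		return (off >= start && off <= start + len - sizeof(int));
	}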
*/ min = (char *)p->p_md.md_regs - kstack; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = TF_REGP(p); frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? 
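For orientation, a summary of the outcomes of the partition clipping that follows, including the sector rounding it starts from (a restatement of the code, not a change to it):

	/*
	 * sz = transfer length in DEV_BSIZE sectors, rounded up:
	 *	sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;
	 *
	 * outcomes:
	 *	label (or boot block) write without wlabel -> EROFS, -1
	 *	b_blkno exactly at partition end -> b_resid = b_bcount,
	 *	    return 0 (EOF)
	 *	transfer partly past the end -> b_bcount truncated to the
	 *	    part that fits, continue
	 *	b_blkno < 0 or nothing fits -> EINVAL, B_ERROR, -1
	 *	otherwise -> b_pblkno/b_cylin set for disksort, return 1
	 */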
*/ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } /* calculate cylinder for disksort to order transfers with */ bp->b_pblkno = bp->b_blkno + p->p_offset; bp->b_cylin = bp->b_pblkno / lp->d_secpercyl; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } int disk_externalize(int drive, struct sysctl_req *req) { return SYSCTL_OUT(req, &drive, sizeof drive); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 13489) +++ head/sys/i386/i386/pmap.c (revision 13490) @@ -1,1954 +1,2167 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.71 1995/12/17 07:19:15 bde Exp $ + * $Id: pmap.c,v 1.72 1995/12/22 18:21:26 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. 
Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#define PMAP_KEEP_PDIRS + +static void init_pv_entries __P((int)); + /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
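The macros above lean on the recursive map named in the header comment; a sketch of the address arithmetic, assuming the usual i386 10/10/12 split (PD_SHIFT = 22):

	/*
	 * Slot PTDPTDI of the page directory points at the directory
	 * page itself.  Walked through that slot, the MMU serves up all
	 * page tables as one linear PTE array at PTmap and the
	 * directory as PTD, which is what vtopte() and pmap_pde()
	 * compute; APTmap/APTDpde are the same trick through an
	 * alternate slot, used when examining a foreign pmap.
	 */
	pt_entry_t *pte = PTmap + i386_btop(va);	   /* top 20 bits */
	pd_entry_t *pde = &PTD[(vm_offset_t)va >> PD_SHIFT]; /* top 10 */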
*/ static vm_offset_t vm_first_phys; static int nkpt; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; static pv_entry_t pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; static void free_pv_entry __P((pv_entry_t pv)); static pt_entry_t * get_pt_entry __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); -static void init_pv_entries __P((int npg)); static void pmap_alloc_pv_entry __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa)); static int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static void pmap_remove_entry __P((struct pmap *pmap, pv_entry_t pv, vm_offset_t va)); static vm_page_t pmap_pte_vm_page __P((pmap_t pmap, vm_offset_t pt)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); /* + * The below are finer grained pmap_update routines. These eliminate + * the gratuitious tlb flushes on non-i386 architectures. + */ +static __inline void +pmap_update_1pg( vm_offset_t va) { +#if defined(I386_CPU) + if (cpuclass == CPUCLASS_I386) + pmap_update(); + else +#endif + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); +} + +static __inline void +pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { +#if defined(I386_CPU) + if (cpuclass == CPUCLASS_I386) { + pmap_update(); + } else +#endif + { + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); + } +} + +/* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ -inline pt_entry_t * __pure +__inline pt_entry_t * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return ((pt_entry_t *) avtopte(va)); } } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t pa; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } pa = *(int *) avtopte(va); } return ((pa & PG_FRAME) | (va & ~PG_FRAME)); } return 0; } /* * determine if a page is managed (memory vs. 
device) */ -static inline int +static __inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ static __inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { vm_page_t m; - pt = i386_trunc_page(pt); - pt = (pt - UPT_MIN_ADDRESS) / NBPG; + pt = trunc_page(pt); + pt = (pt - UPT_MIN_ADDRESS) / PAGE_SIZE; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; m = PHYS_TO_VM_PAGE(pt); return m; } /* * Wire a page table page */ __inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold(pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ -inline void +__inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; vm_page_t m; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); m = pmap_pte_vm_page(pmap, pt); vm_page_unhold(m); if (pmap != kernel_pmap && (m->hold_count == 0) && (m->wire_count == 0) && (va < KPT_MIN_ADDRESS)) { +/* + * We don't free page-table-pages anymore because it can have a negative + * impact on perf at times. Now we just deactivate, and it'll get cleaned + * up if needed... Also, if the page ends up getting used, it will fault + * back into the process address space and be reactivated. + */ +#ifdef PMAP_FREE_OLD_PTES pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); +#else + m->dirty = 0; + vm_page_deactivate(m); +#endif } } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* - * XXX The calculation of virtual_avail is wrong. It's NKPT*NBPG too + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. 
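A note on the pmap_update_1pg()/pmap_update_2pg() helpers added above: INVLPG (opcode 0F 01 /7) only exists from the i486 on, and assemblers of the day had no mnemonic for it, hence the .byte encoding with the address in %eax. Their i386 fallback is spelled cpuclass/CPUCLASS_I386, while this file declares extern int cpu_class and the rest of the tree uses CPUCLASS_386; a sketch using those existing spellings (invalidate_1pg is an illustrative name):

	static __inline void
	invalidate_1pg(vm_offset_t va)
	{
	#if defined(I386_CPU)
		if (cpu_class == CPUCLASS_386)
			pmap_update();	/* no invlpg on a 386: flush all */
		else
	#endif
			/* invlpg (%eax): 0F 01 /7, mod=00, r/m=eax */
			__asm __volatile(".byte 0xf,0x1,0x38" : : "a" (va));
	}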
*/ #define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*NBPG); p = pte; pte += (n); + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufmap is used to map the system message buffer. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1) virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0; pmap_update(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t npg, s; int i; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); - npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG; + npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(struct pv_entry) * npg); - s = i386_round_page(s); + s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_entry_t) addr; /* * init the pv free list */ init_pv_entries(npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } +#ifdef PMAP_KEEP_PDIRS +int nfreepdir; +caddr_t *pdirlist; +#define NFREEPDIR 3 + +static void * +pmap_getpdir() { + caddr_t *pdir; + if (pdirlist) { + --nfreepdir; + pdir = pdirlist; + pdirlist = (caddr_t *) *pdir; + bzero( (caddr_t) pdir, PAGE_SIZE); + } else { + pdir = (caddr_t *) kmem_alloc(kernel_map, PAGE_SIZE); + } + + return (void *) pdir; +} + +static void +pmap_freepdir(void *pdir) { + if (nfreepdir > NFREEPDIR) { + kmem_free(kernel_map, (vm_offset_t) pdir, PAGE_SIZE); + } else { + * (caddr_t *) pdir = (caddr_t) pdirlist; + pdirlist = (caddr_t *) pdir; + ++nfreepdir; + } +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { /* * No need to allocate page table space yet but we do need a valid * page directory table. 
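pmap_pinit() is about to draw its directory from the PMAP_KEEP_PDIRS cache above. The cache is an intrusive freelist (the first word of each cached page doubles as the link), so it costs no storage beyond the pages themselves: pmap_getpdir() pops and re-zeroes, pmap_freepdir() pushes until more than NFREEPDIR pages are held and then falls back to kmem_free(). A minimal usage sketch:

	pd_entry_t *pdir;

	pdir = (pd_entry_t *)pmap_getpdir();	/* zeroed, maybe recycled */
	/* ... install kernel PDEs, use as a page directory ... */
	pmap_freepdir((void *)pdir);	/* cached for the next pmap_pinit() */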
*/ + +#ifdef PMAP_KEEP_PDIRS + pmap->pm_pdir = pmap_getpdir(); +#else pmap->pm_pdir = (pd_entry_t *) kmem_alloc(kernel_map, PAGE_SIZE); +#endif /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(int *) (pmap->pm_pdir + PTDPTDI) = ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; } /* * grow the number of kernel page table entries, if needed */ static vm_page_t nkpg; vm_offset_t kernel_vm_end; void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } - addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW); nkpg = NULL; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { +#ifdef PMAP_KEEP_PDIRS + pmap_freepdir( (void *)pmap->pm_pdir); +#else kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); +#endif } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } -#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) +#define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * free the pv_entry back to the free list */ -inline static void +static __inline void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
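Because a pv_entry can be needed at interrupt time, this allocator never touches malloc(): KVA is reserved up front and backed one page at a time. The reservation arithmetic, restated from init_pv_entries() and pmap_alloc_pv_entry():

	/*
	 * Reserve pageable KVA for up to PVSPERPAGE (64) entries per
	 * managed physical page; each refill wires one page with
	 * VM_ALLOC_INTERRUPT, maps it at pvva via pmap_kenter(), and
	 * carves it into PAGE_SIZE / sizeof(struct pv_entry) free-list
	 * entries.  The PV_FREELIST_MIN float keeps the refill itself
	 * from running the list dry.
	 */
	npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) +
	    PAGE_SIZE - 1) / PAGE_SIZE;
	pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE);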
*/ -static inline pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? */ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; - newentries = (NBPG / sizeof(struct pv_entry)); + newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ - pvva += NBPG; + pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ - npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG; - pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); + npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + + PAGE_SIZE - 1) / PAGE_SIZE; + pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { return PTmap; } /* otherwise, we are alternate address space */ if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return APTmap; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { - for (npv = pv->pv_next; npv; npv = npv->pv_next) { + for (npv = pv->pv_next; npv; (pv = npv, npv = pv->pv_next)) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } - pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
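get_pv_entry() and pmap_alloc_pv_entry() above implement an allocator usable where malloc() is not: entries come off a private free list, the list is refilled a whole vm page at a time, and a low-water reserve (PV_FREELIST_MIN) plus the temporary bump of pv_freelistcnt keeps a refill from re-triggering itself. A user-space model of the pattern -- it mallocs a block where the kernel carves up a vm page, it omits the recursion-guard bump since nothing here can recurse, and every name is illustrative:

#include <stdlib.h>

struct entry { struct entry *next; };

#define BLOCK_ENTRIES 128
#define FREELIST_MIN  (BLOCK_ENTRIES / 2)

static struct entry *freehead;
static int freecnt;

static void
entry_free(struct entry *e)
{
        e->next = freehead;
        freehead = e;
        ++freecnt;
}

static void
refill(void)
{
        struct entry *block = malloc(sizeof(struct entry) * BLOCK_ENTRIES);
        int i;

        if (block == NULL)
                return;                 /* kernel panics if the list stays empty */
        for (i = 0; i < BLOCK_ENTRIES; i++)
                entry_free(&block[i]);
}

static struct entry *
entry_get(void)
{
        struct entry *e;

        if (freecnt < FREELIST_MIN || freehead == NULL)
                refill();               /* refill before we run dry */
        if (freehead == NULL)
                abort();                /* matches the kernel's panic() */
        e = freehead;
        freehead = e->next;
        --freecnt;
        return (e);
}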
*/ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp, *ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ - if ((sva + NBPG) == eva) { + if ((sva + PAGE_SIZE) == eva) { if (*pmap_pde(pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if (!*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); } pmap_unuse_pt(pmap, sva); - pmap_update(); + pmap_update_1pg(sva); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (*pmap_pde(pmap, i386_ptob(sva)) == 0) { /* We can race ahead here, straight to next pde.. */ sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations that * are much faster than explicitly scanning when page tables * are not fully populated. */ if (*ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(ptq), "=a"(found) : "c"(nscan), "0"(ptq) : "cx"); if (!found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int) oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. XXX: should cluster them up and * invalidate as many as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we may be called * at interrupt time). */ pa = ((int) oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { - pmap_unuse_pt(pmap, va); + pmap_unuse_pt(pmap, (vm_offset_t) va); ++sva; continue; } if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } pmap_update(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { - register pv_entry_t pv, npv; + register pv_entry_t pv, opv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! 
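The inline assembly in pmap_remove() above ("xorl %eax,%eax; cld; repe; scasl; ...") is just a fast scan for the first nonzero 32-bit PTE in a run; because scasl advances %edi past the element it tests, the caller backs up with ptq -= 1 on a hit. The same computation in plain C, as a readable stand-in:

/* Scan nscan 32-bit words for the first nonzero one.  On a hit the
 * pointer is left one past the hit, mimicking scasl, and 1 is
 * returned; otherwise the pointer is left one past the run. */
static int
scan_nonzero(unsigned **pp, unsigned nscan)
{
        unsigned *p = *pp;

        while (nscan != 0 && *p == 0) {
                ++p;
                --nscan;
        }
        if (nscan == 0) {
                *pp = p;        /* ran off the end: nothing found */
                return (0);
        }
        *pp = p + 1;            /* like scasl: pointer is one past the hit */
        return (1);
}

The string instruction wins when page tables are sparsely populated, which is the common case the comment in pmap_remove() calls out.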
*/ if (!pmap_is_managed(pa)) return; - pa = i386_trunc_page(pa); - pv = pa_to_pvh(pa); - m = PHYS_TO_VM_PAGE(pa); + pa = trunc_page(pa); + opv = pa_to_pvh(pa); + if (opv->pv_pmap == NULL) + return; + m = PHYS_TO_VM_PAGE(pa); s = splhigh(); - while (pv->pv_pmap != NULL) { - pmap = pv->pv_pmap; + pv = opv; + while (pv && ((pmap = pv->pv_pmap) != NULL)) { ptp = get_pt_entry(pmap); va = pv->pv_va; pte = ptp + i386_btop(va); if (pmap_pte_w(pte)) pmap->pm_stats.wired_count--; if (*pte) { pmap->pm_stats.resident_count--; - anyvalid++; + if (curproc != pageproc) + anyvalid++; /* * Update the vm_page_t clean and reference bits. */ if ((int) *pte & PG_M) { - if (va < USRSTACK + (UPAGES * NBPG) || + if (va < USRSTACK + (UPAGES * PAGE_SIZE) || (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } *pte = 0; pmap_unuse_pt(pmap, va); } + pv = pv->pv_next; + } + + for (pv = opv->pv_next; pv; pv = npv) { npv = pv->pv_next; - if (npv) { - *pv = *npv; - free_pv_entry(npv); - } else { - pv->pv_pmap = NULL; - } + free_pv_entry(pv); } + + opv->pv_pmap = NULL; + opv->pv_next = NULL; + splx(s); if (anyvalid) pmap_update(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register pt_entry_t *pte; register vm_offset_t va; int i386prot; register pt_entry_t *ptp; int evap = i386_btop(eva); int anyvalid = 0; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; ptp = get_pt_entry(pmap); va = sva; while (va < eva) { int found = 0; int svap; vm_offset_t nscan; /* * Page table page is not allocated. Skip it, we don't want to * force allocation of unnecessary PTE pages just to set the * protection. */ if (!*pmap_pde(pmap, va)) { /* XXX: avoid address wrap around */ nextpde: if (va >= i386_trunc_pdr((vm_offset_t) - 1)) break; va = i386_round_pdr(va + PAGE_SIZE); continue; } pte = ptp + i386_btop(va); if (*pte == 0) { /* * scan for a non-empty pte */ svap = pte - ptp; nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap; if (nscan + svap > evap) nscan = evap - svap; found = 0; if (nscan) asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(pte), "=a"(found) : "c"(nscan), "0"(pte) : "cx"); if (!found) goto nextpde; pte -= 1; svap = pte - ptp; va = i386_ptob(svap); } anyvalid++; i386prot = pte_prot(pmap, prot); if (va < UPT_MAX_ADDRESS) { i386prot |= PG_u; if (va >= UPT_MIN_ADDRESS) i386prot |= PG_RW; } pmap_pte_set_prot(pte, i386prot); va += PAGE_SIZE; } if (anyvalid) pmap_update(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW.
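The pv_table manipulated throughout this file uses a space-saving convention worth spelling out: the first mapping of a physical page lives directly in the page's pv_table slot (the "header"), additional mappings are chained behind it with get_pv_entry(), and an unmapped page is marked by pv_pmap == NULL in the header. That is why pmap_remove_entry() earlier promotes the first chained entry into the header (*pv = *npv) instead of unlinking it, and why the reworked pmap_remove_all() above finishes by resetting pv_pmap and pv_next; pmap_enter() below inserts into the same structure. A user-space model of removal under this convention (illustrative names, with malloc/free standing in for the pv_entry allocator):

#include <stdlib.h>

struct pv {
        struct pv *next;
        void      *pmap;        /* NULL in the header means "no mappings" */
        unsigned   va;
};

static void
pv_remove(struct pv *head, void *pmap, unsigned va)
{
        struct pv *pv, *npv;

        if (head->pmap == pmap && head->va == va) {
                npv = head->next;
                if (npv != NULL) {
                        *head = *npv;   /* promote first chained entry */
                        free(npv);
                } else {
                        head->pmap = NULL;  /* header now marks "empty" */
                }
                return;
        }
        for (pv = head; (npv = pv->next) != NULL; pv = npv) {
                if (npv->pmap == pmap && npv->va == va) {
                        pv->next = npv->next;   /* ordinary unlink */
                        free(npv);
                        return;
                }
        }
}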
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; - va = i386_trunc_page(va); - pa = i386_trunc_page(pa); + va = trunc_page(va); + pa = trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { - pmap_update(); + pmap_update_1pg(va); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
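A central change in this diff is replacing whole-TLB flushes (pmap_update(), a CR3 reload) with single-page invalidations, pmap_update_1pg() and pmap_update_2pg(), at sites like pmap_enter() above and pmap_qenter() below that touch only one or two mappings. Their definitions are not part of this diff; on a 486 or later they presumably come down to the invlpg instruction, roughly as in this sketch (an assumption about code that lives in a pmap header, not the committed text -- a plain 386 lacks invlpg and would still need the full reload):

/* Plausible shape of the single-page TLB invalidation primitives.
 * invlpg drops the TLB entry for one virtual page instead of
 * flushing the entire TLB the way a CR3 reload does. */
static __inline void
pmap_update_1pg(vm_offset_t va)
{
        __asm __volatile("invlpg %0" : : "m" (*(char *)va) : "memory");
}

static __inline void
pmap_update_2pg(vm_offset_t va1, vm_offset_t va2)
{
        pmap_update_1pg(va1);
        pmap_update_1pg(va2);
}

The payoff is that unrelated hot TLB entries survive a single-page unmap, which matters most in the fast paths this commit touches.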
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); - if (*pte) - anyvalid++; - *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + vm_offset_t tva = va + i * PAGE_SIZE; + pt_entry_t npte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + pte = vtopte(tva); + if (*pte && (*pte != npte)) + pmap_update_1pg(tva); + *pte = npte; } - if (anyvalid) - pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); + vm_offset_t tva = va + i * PAGE_SIZE; + pte = vtopte(tva); *pte = 0; + pmap_update_1pg(tva); } - pmap_update(); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; int wasvalid = 0; pte = vtopte(va); if (*pte) wasvalid++; *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V)); if (wasvalid) - pmap_update(); + pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ void pmap_kremove(va) vm_offset_t va; { register pt_entry_t *pte; pte = vtopte(va); *pte = (pt_entry_t) 0; - pmap_update(); + pmap_update_1pg(va); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static inline void +static __inline void pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pte = vtopte(va); /* a fault on the page table might occur here */ if (*pte) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_pmap = pmap; pv->pv_va = va; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place this entry * after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ((int) (pa | PG_V | PG_u)); pmap_use_pt(pmap, va); return; } -#define MAX_INIT_PT (512 * 4096) +#define MAX_INIT_PT (512) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
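One detail of the pmap_qenter() change above deserves a note before the preload code: the old version counted any previously-valid slot and issued one global pmap_update(); the new version invalidates per page, and only when the slot held a different mapping, so re-entering an identical pte costs no flush at all. A tiny standalone model of that decision (plain C, illustrative names and pte values):

#include <stdio.h>

/* One pte slot per page; nonzero means a valid mapping. */
static unsigned slots[4] = { 0, 0x1007, 0x2007, 0x3007 };

static int
qenter_one(int i, unsigned npte)
{
        int flushed = 0;

        if (slots[i] != 0 && slots[i] != npte)
                flushed = 1;    /* kernel: pmap_update_1pg(tva) */
        slots[i] = npte;
        return (flushed);
}

int
main(void)
{
        printf("empty slot:     flush=%d\n", qenter_one(0, 0x4007));
        printf("same mapping:   flush=%d\n", qenter_one(1, 0x1007));
        printf("different map:  flush=%d\n", qenter_one(2, 0x4007));
        return (0);
}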
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; { vm_offset_t tmpidx; int psize; vm_page_t p; int objpgs; - if (!pmap || ((size > MAX_INIT_PT) && - (object->resident_page_count > MAX_INIT_PT / PAGE_SIZE))) { + psize = (size >> PAGE_SHIFT); + + if (!pmap || ((psize > MAX_INIT_PT) && + (object->resident_page_count > MAX_INIT_PT))) { return; } - psize = (size >> PAGE_SHIFT); /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = object->memq.tqh_first; ((objpgs > 0) && (p != NULL)); p = p->listq.tqe_next) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } - if (((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) && - ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && - (p->bmapped == 0) && + if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { - if (p->flags & PG_CACHE) + if (p->queue == PQ_CACHE) vm_page_deactivate(p); vm_page_hold(p); p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + (tmpidx << PAGE_SHIFT), VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); - if (p && - ((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) && - (p->bmapped == 0) && - (p->busy == 0) && + if (p && (p->busy == 0) && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { - if (p->flags & PG_CACHE) + if (p->queue == PQ_CACHE) vm_page_deactivate(p); vm_page_hold(p); p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + (tmpidx << PAGE_SHIFT), VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } } } } /* + * pmap_prefault provides a quick way of clustering + * pagefaults into a process's address space. It is a "cousin" + * of pmap_object_init_pt, except it runs at page fault time instead + * of mmap time.
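Two gates control pmap_object_init_pt() above, and both change in this diff: MAX_INIT_PT is now counted in pages (512 pages, i.e. 2 MB of 4 KB pages) rather than bytes, and the full-object scan is chosen once the request covers more than a quarter of the object. The decision logic as plain arithmetic, in a hedged standalone sketch (function and parameter names are made up):

#include <stdio.h>

#define PAGE_SHIFT  12
#define MAX_INIT_PT 512         /* pages, after this change */

static const char *
preload_plan(unsigned long size_bytes, unsigned long obj_pages,
    unsigned long resident)
{
        unsigned long psize = size_bytes >> PAGE_SHIFT;

        if (psize > MAX_INIT_PT && resident > MAX_INIT_PT)
                return ("skip preload");
        if (psize > obj_pages / 4)
                return ("scan object's resident list");
        return ("lookup pages one by one");
}

int
main(void)
{
        printf("%s\n", preload_plan(8ul << 20, 4096, 1024)); /* 8 MB mmap */
        printf("%s\n", preload_plan(64ul << 10, 4096, 1024)); /* 64 KB mmap */
        return (0);
}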
+ */ +#define PFBAK 2 +#define PFFOR 2 +#define PAGEORDER_SIZE (PFBAK+PFFOR) + +static int pmap_prefault_pageorder[] = { + -NBPG, NBPG, -2 * NBPG, 2 * NBPG +}; + +void +pmap_prefault(pmap, addra, entry, object) + pmap_t pmap; + vm_offset_t addra; + vm_map_entry_t entry; + vm_object_t object; +{ + int i; + vm_offset_t starta; + vm_offset_t addr; + vm_pindex_t pindex; + vm_page_t m; + int pageorder_index; + + if (entry->object.vm_object != object) + return; + + if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) + return; + + starta = addra - PFBAK * PAGE_SIZE; + if (starta < entry->start) { + starta = entry->start; + } else if (starta > addra) { + starta = 0; + } + + for (i = 0; i < PAGEORDER_SIZE; i++) { + vm_object_t lobject; + pt_entry_t *pte; + + addr = addra + pmap_prefault_pageorder[i]; + if (addr < starta || addr >= entry->end) + continue; + + pte = vtopte(addr); + if (*pte) + continue; + + pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; + lobject = object; + for (m = vm_page_lookup(lobject, pindex); + (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); + lobject = lobject->backing_object) { + if (lobject->backing_object_offset & (PAGE_MASK-1)) + break; + pindex += (lobject->backing_object_offset >> PAGE_SHIFT); + m = vm_page_lookup(lobject->backing_object, pindex); + } + + /* + * give-up when a page is not in memory + */ + if (m == NULL) + break; + + if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && + (m->busy == 0) && + (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { + + if (m->queue == PQ_CACHE) { + if (cnt.v_free_count + cnt.v_cache_count < + cnt.v_free_min) + break; + vm_page_deactivate(m); + } + vm_page_hold(m); + m->flags |= PG_MAPPED; + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); + vm_page_unhold(m); + } + } +} + +/* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); - /* - * When unwiring, set the modified bit in the pte -- could have been - * changed by the kernel - */ - if (!wired) - (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
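The new pmap_prefault() above probes a fixed neighborhood around the faulting address, nearest pages first and alternating behind/ahead (PFBAK and PFFOR are both 2), and gives up as soon as a candidate page is not already resident. A standalone demo of the probe order it generates (the fault address is made up):

#include <stdio.h>

#define PGSZ 4096

/* mirrors pmap_prefault_pageorder[]: -1, +1, -2, +2 pages */
static const long pageorder[] = { -PGSZ, PGSZ, -2 * PGSZ, 2 * PGSZ };

int
main(void)
{
        unsigned long addra = 0x40005000ul;     /* hypothetical fault va */
        size_t i;

        for (i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++)
                printf("probe %#lx\n", addra + pageorder[i]);
        return (0);
}

Stopping at the first non-resident page keeps the prefault from forcing I/O; it only maps what is already in memory.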
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); - bzero(CADDR2, NBPG); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(phys); + bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); - *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); + *(int *) CMAP1 = PG_V | PG_KW | trunc_page(src); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(dst); #if __GNUC__ > 1 - memcpy(CADDR2, CADDR1, NBPG); + memcpy(CADDR2, CADDR1, PAGE_SIZE); #else - bcopy(CADDR1, CADDR2, NBPG); + bcopy(CADDR1, CADDR2, PAGE_SIZE); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
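Returning for a moment to pmap_zero_page() and pmap_copy_page() above: they use a classic reserved-window trick. CADDR1/CADDR2 are kernel virtual pages set aside by SYSMAP earlier, and their ptes, CMAP1/CMAP2, are simply pointed at whatever physical page needs zeroing or copying; afterwards only those one or two virtual pages need TLB invalidation, which is why the new pmap_update_1pg()/pmap_update_2pg() calls fit so well here. A rough user-space analogue of the idea, remapping over a fixed reservation with mmap in place of a pte store (POSIX, a loose analogy rather than the kernel mechanism):

#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        size_t pg = (size_t)sysconf(_SC_PAGESIZE);
        /* the reserved window (kernel: CADDR2, backed by pte CMAP2) */
        void *win = mmap(NULL, pg, PROT_NONE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (win == MAP_FAILED)
                return (1);
        /* "point the window at a page", zero it, then "unmap" it */
        if (mmap(win, pg, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
                return (1);
        memset(win, 0, pg);
        mprotect(win, pg, PROT_NONE);
        return (0);
}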
*/ - if (bit & PG_U) { + if (bit & (PG_U|PG_M)) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } - if (bit & PG_M) { - if (pv->pv_va >= USRSTACK) { - if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { - continue; - } - if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { - splx(s); - return TRUE; - } else if (pv->pv_va < KERNBASE) { - splx(s); - return FALSE; - } - } - } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; + int changed; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); - if (setem) + if (setem) { (int) npte = (int) *pte | bit; - else + } else { (int) npte = (int) *pte & ~bit; + } *pte = npte; } } splx(s); - pmap_update(); + if (curproc != pageproc) + pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. */ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
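Stepping back to i386_protection_init() above: the switch looks long, but because the i386 pte has no execute bit, the eight VM_PROT_{READ,WRITE,EXECUTE} combinations collapse to a single question, "is write permitted?". The table it builds is equivalent to this little demo (the constants are the standard VM_PROT_* values and the i386 PG_RW pte bit; the rest is illustrative):

#include <stdio.h>

#define VM_PROT_READ    0x01
#define VM_PROT_WRITE   0x02
#define VM_PROT_EXECUTE 0x04
#define PG_RW           0x002   /* i386 pte writable bit */

int
main(void)
{
        int prot;

        /* same result as walking i386_protection_init()'s switch */
        for (prot = 0; prot < 8; prot++)
                printf("prot %d -> pte bits %#x\n", prot,
                    (prot & VM_PROT_WRITE) ? PG_RW : 0);
        return (0);
}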
The non-cacheable bits are set on each * mapped page. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } + +#ifdef PMAP_DEBUG +pmap_pid_dump(int pid) { + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = &p->p_vmspace->vm_pmap; + for(i=0;i<1024;i++) { + pd_entry_t *pde; + pt_entry_t *pte; + unsigned base = i << PD_SHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for(j=0;j<1024;j++) { + unsigned va = base + (j << PG_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + return npte; + } + pte = pmap_pte( pmap, va); + if (pte && pmap_pte_v(pte)) { + vm_offset_t pa; + vm_page_t m; + pa = *(int *)pte; + m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + return npte; +} +#endif #ifdef DEBUG static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 13489) +++ head/sys/i386/i386/trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority 
without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
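Before the double-fault handler, one pattern in trap_pfault() above deserves a concrete analogue: when a kernel-mode fault cannot be resolved, the handler does not panic if curpcb->pcb_onfault is set -- it rewrites the saved %eip so the faulting copyin()/copyout() resumes at its registered error exit. In user space the same control structure is a signal handler plus sigsetjmp/siglongjmp (a loose analogy, not the kernel mechanism; deliberately dereferencing a bad address like this is formally undefined and used here only to trigger the handler):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf onfault;

static void
segv(int sig)
{
        (void)sig;
        siglongjmp(onfault, 1);   /* kernel: frame->tf_eip = pcb_onfault */
}

int
main(void)
{
        signal(SIGSEGV, segv);
        if (sigsetjmp(onfault, 1) == 0) {
                volatile char *p = (char *)0x10;   /* bad address */
                *p = 0;                            /* faults */
                puts("no fault");
        } else {
                puts("fault recovered");
        }
        return (0);
}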
* * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available unless * the machine was idle when the double fault occurred. The downside * of this is that "trace" in ddb won't work. */ void dblfault_handler() { struct pcb *pcb = curpcb; if (pcb != NULL) { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); } panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va, v; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; ++p->p_lock; if ((caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { --p->p_lock; return (1); } } v = trunc_page(vtopte(va)); /* * wire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); /* * unwire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE); } --p->p_lock; if (rv != KERN_SUCCESS) return 1; return (0); } /* * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. */ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int args[8], rval[2]; u_int code; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); p->p_md.md_regs = (int *)&frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; if ((i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif rval[0] = 0; rval[1] = frame.tf_edx; error = (*callp->sy_call)(p, args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_edx = rval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes.
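This ERESTART convention, together with its Linux counterpart below (frame.tf_eip -= 2 for int 0x80), is the entire restart mechanism: rewind the saved user pc by the size of the trapping instruction, and the same system call re-executes when the process returns to user mode. The arithmetic, spelled out with a made-up address:

#include <stdio.h>

int
main(void)
{
        unsigned eip = 0x08048123;      /* saved pc, after the trap insn */

        printf("lcall gate restart: %#x\n", eip - 7); /* lcall is 7 bytes */
        printf("int 0x80 restart:   %#x\n", eip - 2); /* int imm8 is 2 bytes */
        return (0);
}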
*/ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/i386/i386/vm_machdep.c =================================================================== --- head/sys/i386/i386/vm_machdep.c (revision 13489) +++ head/sys/i386/i386/vm_machdep.c (revision 13490) @@ -1,871 +1,871 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ - * $Id: vm_machdep.c,v 1.49 1995/12/14 08:31:01 phk Exp $ + * $Id: vm_machdep.c,v 1.50 1996/01/05 20:12:23 wollman Exp $ */ #include "npx.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_fault_quick __P((caddr_t v, int prot)); #ifdef BOUNCE_BUFFERS static vm_offset_t vm_bounce_kva __P((int size, int waitok)); static void vm_bounce_kva_free __P((vm_offset_t addr, vm_offset_t size, int now)); static vm_offset_t vm_bounce_page_find __P((int count)); static void vm_bounce_page_free __P((vm_offset_t pa, int count)); static volatile int kvasfreecnt; caddr_t bouncememory; int bouncepages; static int bpwait; static vm_offset_t *bouncepa; static int bmwait, bmfreeing; #define BITS_IN_UNSIGNED (8*sizeof(unsigned)) static int bounceallocarraysize; static unsigned *bounceallocarray; static int bouncefree; #define SIXTEENMEG (4096*4096) #define MAXBKVA 1024 int maxbkva = MAXBKVA*NBPG; /* special list that can be used at interrupt time for eventual kva free */ static struct kvasfree { vm_offset_t addr; vm_offset_t size; } kvaf[MAXBKVA]; /* * get bounce buffer pages (count physically contiguous) * (only 1 inplemented now) */ static vm_offset_t vm_bounce_page_find(count) int count; { int bit; int s,i; if (count != 1) panic("vm_bounce_page_find -- no support for > 1 page yet!!!"); s = splbio(); retry: for (i = 0; i < bounceallocarraysize; i++) { if (bounceallocarray[i] != 0xffffffff) { bit = ffs(~bounceallocarray[i]); if (bit) { bounceallocarray[i] |= 1 << (bit - 1) ; bouncefree -= count; splx(s); return bouncepa[(i * BITS_IN_UNSIGNED + (bit - 1))]; } } } bpwait = 1; tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0); goto retry; } static void vm_bounce_kva_free(addr, size, now) vm_offset_t addr; vm_offset_t size; int now; { int s = splbio(); kvaf[kvasfreecnt].addr = addr; kvaf[kvasfreecnt].size = size; ++kvasfreecnt; if( now) { /* * this will do wakeups */ vm_bounce_kva(0,0); } else { if (bmwait) { /* * if anyone is waiting on the bounce-map, then wakeup */ wakeup((caddr_t) io_map); bmwait = 0; } } splx(s); } /* * free count bounce buffer pages */ static void vm_bounce_page_free(pa, count) vm_offset_t pa; int count; { int allocindex; int index; int bit; if (count != 1) panic("vm_bounce_page_free -- no support for > 1 page yet!!!"); for(index=0;indexb_flags & B_BOUNCE) { printf("vm_bounce_alloc: called 
recursively???\n"); return; } if (bp->b_bufsize < bp->b_bcount) { printf( "vm_bounce_alloc: b_bufsize(0x%lx) < b_bcount(0x%lx) !!\n", bp->b_bufsize, bp->b_bcount); panic("vm_bounce_alloc"); } /* * This is not really necessary * if( bp->b_bufsize != bp->b_bcount) { * printf("size: %d, count: %d\n", bp->b_bufsize, bp->b_bcount); * } */ vastart = (vm_offset_t) bp->b_data; vaend = (vm_offset_t) bp->b_data + bp->b_bufsize; - vapstart = i386_trunc_page(vastart); - vapend = i386_round_page(vaend); + vapstart = trunc_page(vastart); + vapend = round_page(vaend); countvmpg = (vapend - vapstart) / NBPG; /* * if any page is above 16MB, then go into bounce-buffer mode */ va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) ++dobounceflag; if( pa == 0) panic("vm_bounce_alloc: Unmapped page"); va += NBPG; } if (dobounceflag == 0) return; if (bouncepages < dobounceflag) panic("Not enough bounce buffers!!!"); /* * allocate a replacement kva for b_addr */ kva = vm_bounce_kva(countvmpg*NBPG, 1); #if 0 printf("%s: vapstart: %x, vapend: %x, countvmpg: %d, kva: %x ", (bp->b_flags & B_READ) ? "read":"write", vapstart, vapend, countvmpg, kva); #endif va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) { /* * allocate a replacement page */ vm_offset_t bpa = vm_bounce_page_find(1); pmap_kenter(kva + (NBPG * i), bpa); #if 0 printf("r(%d): (%x,%x,%x) ", i, va, pa, bpa); #endif /* * if we are writing, the copy the data into the page */ if ((bp->b_flags & B_READ) == 0) { bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG); } } else { /* * use original page */ pmap_kenter(kva + (NBPG * i), pa); } va += NBPG; } /* * flag the buffer as being bounced */ bp->b_flags |= B_BOUNCE; /* * save the original buffer kva */ bp->b_savekva = bp->b_data; /* * put our new kva into the buffer (offset by original offset) */ bp->b_data = (caddr_t) (((vm_offset_t) kva) | ((vm_offset_t) bp->b_savekva & (NBPG - 1))); #if 0 printf("b_savekva: %x, newva: %x\n", bp->b_savekva, bp->b_data); #endif return; } /* * hook into biodone to free bounce buffer */ void vm_bounce_free(bp) struct buf *bp; { int i; vm_offset_t origkva, bouncekva, bouncekvaend; /* * if this isn't a bounced buffer, then just return */ if ((bp->b_flags & B_BOUNCE) == 0) return; /* * This check is not necessary * if (bp->b_bufsize != bp->b_bcount) { * printf("vm_bounce_free: b_bufsize=%d, b_bcount=%d\n", * bp->b_bufsize, bp->b_bcount); * } */ origkva = (vm_offset_t) bp->b_savekva; bouncekva = (vm_offset_t) bp->b_data; /* printf("free: %d ", bp->b_bufsize); */ /* * check every page in the kva space for b_addr */ for (i = 0; i < bp->b_bufsize; ) { vm_offset_t mybouncepa; vm_offset_t copycount; - copycount = i386_round_page(bouncekva + 1) - bouncekva; - mybouncepa = pmap_kextract(i386_trunc_page(bouncekva)); + copycount = round_page(bouncekva + 1) - bouncekva; + mybouncepa = pmap_kextract(trunc_page(bouncekva)); /* * if this is a bounced pa, then process as one */ - if ( mybouncepa != pmap_kextract( i386_trunc_page( origkva))) { + if ( mybouncepa != pmap_kextract( trunc_page( origkva))) { vm_offset_t tocopy = copycount; if (i + tocopy > bp->b_bufsize) tocopy = bp->b_bufsize - i; /* * if this is a read, then copy from bounce buffer into original buffer */ if (bp->b_flags & B_READ) bcopy((caddr_t) bouncekva, (caddr_t) origkva, tocopy); /* * free the bounce allocation */ /* printf("(kva: %x, pa: %x)", bouncekva, mybouncepa); */ vm_bounce_page_free(mybouncepa, 1); } origkva += copycount; bouncekva += 
copycount; i += copycount; } /* printf("\n"); */ /* * add the old kva into the "to free" list */ - bouncekva= i386_trunc_page((vm_offset_t) bp->b_data); - bouncekvaend= i386_round_page((vm_offset_t)bp->b_data + bp->b_bufsize); + bouncekva= trunc_page((vm_offset_t) bp->b_data); + bouncekvaend= round_page((vm_offset_t)bp->b_data + bp->b_bufsize); /* printf("freeva: %d\n", (bouncekvaend - bouncekva) / NBPG); */ vm_bounce_kva_free( bouncekva, (bouncekvaend - bouncekva), 0); bp->b_data = bp->b_savekva; bp->b_savekva = 0; bp->b_flags &= ~B_BOUNCE; return; } /* * init the bounce buffer system */ void vm_bounce_init() { int i; kvasfreecnt = 0; if (bouncepages == 0) return; bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED; bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT); if (!bounceallocarray) panic("Cannot allocate bounce resource array"); bouncepa = malloc(bouncepages * sizeof(vm_offset_t), M_TEMP, M_NOWAIT); if (!bouncepa) panic("Cannot allocate physical memory array"); for(i=0;i= SIXTEENMEG) panic("bounce memory out of range"); if( pa == 0) panic("bounce memory not resident"); bouncepa[i] = pa; bounceallocarray[i/(8*sizeof(int))] &= ~(1<<(i%(8*sizeof(int)))); } bouncefree = bouncepages; } #endif /* BOUNCE_BUFFERS */ /* * quick version of vm_fault */ static void vm_fault_quick(v, prot) caddr_t v; int prot; { if (prot & VM_PROT_WRITE) subyte(v, fubyte(v)); else fubyte(v); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the kernel stack and pcb, making the child * ready to run, and marking it so that it can return differently * than the parent. Returns 1 in the child process, 0 in the parent. * We currently double-map the user area so that the stack is at the same * address in each process; in the future we will probably relocate * the frame pointers on the stack after copying. */ int cpu_fork(p1, p2) register struct proc *p1, *p2; { register struct user *up = p2->p_addr; int offset; /* * Copy pcb and stack from proc p1 to p2. * We do this as cheaply as possible, copying only the active * part of the stack. The stack and pcb need to agree; * this is tricky, as the final pcb is constructed by savectx, * but its frame isn't yet on the stack when the stack is copied. * swtch compensates for this when the child eventually runs. * This should be done differently, with a single call * that copies and updates the pcb+stack, * replacing the bcopy and savectx. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; offset = mvesp() - (int)kstack; bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset, (unsigned) ctob(UPAGES) - offset); p2->p_md.md_regs = p1->p_md.md_regs; pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb); /* * * Arrange for a non-local goto when the new process * is started, to resume here, returning nonzero from setjmp. */ if (savectx(&up->u_pcb, 1)) { /* * Return 1 in child. */ return (1); } return (0); } void cpu_exit(p) register struct proc *p; { #if NNPX > 0 npxexit(p); #endif /* NNPX */ cnt.v_swtch++; cpu_switch(p); panic("cpu_exit"); } void -cpu_wait(p) struct proc *p; { -/* extern vm_map_t upages_map; */ - +cpu_wait(p) + struct proc *p; +{ /* drop per-process resources */ - pmap_remove(vm_map_pmap(u_map), (vm_offset_t) p->p_addr, - ((vm_offset_t) p->p_addr) + ctob(UPAGES)); + pmap_qremove((vm_offset_t) p->p_addr, UPAGES); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); vmspace_free(p->p_vmspace); } /* * Dump the machine specific header information at the start of a core dump. 
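The bounce-page allocator above keeps one bit per page in an array of unsigned words and locates a free page with ffs(). A self-contained sketch of that idiom follows; the names are hypothetical, the tsleep()/retry path of vm_bounce_page_find() is omitted, a 32-bit unsigned is assumed (as BITS_IN_UNSIGNED works out to on i386), and ffs(3) is taken to return the 1-based index of the least significant set bit, or 0 if none:

        #include <strings.h>

        /* Find a clear bit, set it, and return its index; -1 if all taken. */
        static int
        bitmap_alloc(unsigned *map, int nwords)
        {
                int i, bit;

                for (i = 0; i < nwords; i++) {
                        if (map[i] == 0xffffffffU)
                                continue;               /* this word is full */
                        bit = ffs(~map[i]);             /* first clear bit, 1-based */
                        map[i] |= 1U << (bit - 1);
                        return (i * 32 + bit - 1);
                }
                return (-1);
        }

Freeing is the mirror image: clear the bit, as vm_bounce_page_free() and the initialization loop in vm_bounce_init() do above.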
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p)); } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register int npf; register caddr_t addr; int off; vm_offset_t kva; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); /* * this is the kva that is to be used for * the temporary kernel mapping */ kva = (vm_offset_t) bp->b_saveaddr; for (addr = (caddr_t)trunc_page(bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { /* * do the vm_fault if needed, do the copy-on-write thing when * reading stuff off device into memory. */ vm_fault_quick(addr, (bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = pmap_kextract((vm_offset_t) addr); if (pa == 0) panic("vmapbuf: page not present"); /* * hold the data page */ #ifdef DIAGNOSTIC if( VM_PAGE_TO_PHYS(PHYS_TO_VM_PAGE(pa)) != pa) panic("vmapbuf: confused PHYS_TO_VM_PAGE mapping"); #endif vm_page_hold(PHYS_TO_VM_PAGE(pa)); } addr = bp->b_saveaddr = bp->b_data; off = (int)addr & PGOFSET; npf = btoc(round_page(bp->b_bufsize + off)); bp->b_data = (caddr_t) (kva + off); while (npf--) { pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("vmapbuf: null page frame"); pmap_kenter(kva, trunc_page(pa)); addr += PAGE_SIZE; kva += PAGE_SIZE; } } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) pmap_kremove((vm_offset_t) addr); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = NULL; /* * unhold the pde, and data pages */ for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) { /* * release the data page */ pa = pmap_kextract((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } } /* * Force reset the processor by invalidating the entire address space! */ void cpu_reset() { /* * Attempt to do a CPU reset via the keyboard controller, * do not turn of the GateA20, as any machine that fails * to do the reset here would then end up in no man's land. 
*/ #ifndef BROKEN_KEYBOARD_RESET outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ printf("Keyboard reset did not work, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ #endif /* force a shutdown by unmapping entire address space ! */ bzero((caddr_t) PTD, NBPG); /* "good night, sweet prince .... " */ pmap_update(); /* NOTREACHED */ while(1); } /* * Grow the user stack to allow for 'sp'. This version grows the stack in * chunks of SGROWSIZ. */ int grow(p, sp) struct proc *p; u_int sp; { unsigned int nss; caddr_t v; struct vmspace *vm = p->p_vmspace; if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK) return (1); nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE); if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) return (0); if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) < nss) { int grow_amount; /* * If necessary, grow the VM that the stack occupies * to allow for the rlimit. This allows us to not have * to allocate all of the VM up-front in execve (which * is expensive). * Grow the VM by the amount requested rounded up to * the nearest SGROWSIZ to provide for some hysteresis. */ grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ); v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) - grow_amount; /* * If there isn't enough room to extend by SGROWSIZ, then * just extend to the maximum size */ if (v < vm->vm_maxsaddr) { v = vm->vm_maxsaddr; grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT); } if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v, - grow_amount, FALSE) != KERN_SUCCESS)) { + grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) { return (0); } vm->vm_ssize += grow_amount >> PAGE_SHIFT; } return (1); } /* * prototype routine to implement the pre-zeroed page mechanism * this routine is called from the idle loop. */ int vm_page_zero_idle() { vm_page_t m; if ((cnt.v_free_count > cnt.v_interrupt_free_min) && (m = vm_page_queue_free.tqh_first)) { TAILQ_REMOVE(&vm_page_queue_free, m, pageq); enable_intr(); pmap_zero_page(VM_PAGE_TO_PHYS(m)); disable_intr(); TAILQ_INSERT_HEAD(&vm_page_queue_zero, m, pageq); + m->queue = PQ_ZERO; ++vm_page_zero_count; return 1; } return 0; } Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 13489) +++ head/sys/kern/imgact_aout.c (revision 13490) @@ -1,218 +1,211 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman * 4. The name of the developer may be used to endorse or promote products * derived from this software without specific prior written permission. 
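The chunked growth in grow() above reduces to rounding arithmetic: the deficit between the current stack size and the size the fault needs is rounded up to an SGROWSIZ boundary, so a run of small faults costs one map operation rather than many (the hysteresis the comment mentions). A sketch of just that arithmetic; the 128K SGROWSIZ value is an assumption for illustration, not taken from this file:

        #define SGROWSIZ        (128UL * 1024)          /* assumed chunk size */

        /* Bytes to add so a stack of cur_size covers `needed', in chunks. */
        static unsigned long
        stack_grow_amount(unsigned long cur_size, unsigned long needed)
        {
                unsigned long deficit = needed - cur_size;

                /* Generic roundup; works whether or not SGROWSIZ is 2^n. */
                return (((deficit + SGROWSIZ - 1) / SGROWSIZ) * SGROWSIZ);
        }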
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: imgact_aout.c,v 1.20 1995/12/11 04:56:00 dyson Exp $ + * $Id: imgact_aout.c,v 1.21 1995/12/15 02:57:40 peter Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact __P((struct image_params *imgp)); static int exec_aout_imgact(imgp) struct image_params *imgp; { struct exec *a_out = (struct exec *) imgp->image_header; struct vmspace *vmspace = imgp->proc->p_vmspace; unsigned long vmaddr, virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; #if defined(COMPAT_LINUX) || defined(LINUX) /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0) return -1; #endif /* COMPAT_LINUX || defined(LINUX) */ /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = NBPG; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = NBPG; file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = NBPG; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, NBPG); /* * Check various fields in header for validity/bounds. 
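The variant dispatch above condenses to the following standalone sketch: the low 16 bits of a_magic select ZMAGIC or QMAGIC, and a NetBSD-style image stores its magic in network byte order, so the test is retried after ntohl(). The octal magic values and the 4096-byte page are the usual i386 a.out constants, assumed here rather than pulled from the kernel headers:

        #include <arpa/inet.h>

        #define ZMAGIC  0413            /* demand-paged */
        #define QMAGIC  0314            /* demand-paged, header in text */

        /* Returns 0 and fills the offsets, or -1 for an unknown variant. */
        static int
        aout_offsets(unsigned long magic, unsigned long text,
            unsigned long *voff, unsigned long *foff)
        {
                switch ((int)(magic & 0xffff)) {
                case ZMAGIC:
                        *voff = 0;
                        *foff = text ? 4096 : 0;        /* 0: "screwball mode" */
                        return (0);
                case QMAGIC:
                        *voff = 4096;
                        *foff = 0;
                        return (0);
                default:
                        /* NetBSD compatibility: magic kept in network order. */
                        switch ((int)(ntohl(magic) & 0xffff)) {
                        case ZMAGIC:
                        case QMAGIC:
                                *voff = 4096;
                                *foff = 0;
                                return (0);
                        }
                        return (-1);
                }
        }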
*/ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text % NBPG || a_out->a_data % NBPG) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ if (/* text can't exceed maximum text size */ a_out->a_text > MAXTSIZ || /* data + bss can't exceed maximum data size */ a_out->a_data + bss_size > MAXDSIZ || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) return (ENOMEM); /* copy in arguments and/or environment from old process */ error = exec_extract_strings(imgp); if (error) return (error); /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(imgp); /* - * Map text read/execute + * Map text/data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, /* map */ &vmaddr, /* address */ - a_out->a_text, /* size */ + a_out->a_text + a_out->a_data, /* size */ VM_PROT_READ | VM_PROT_EXECUTE, /* protection */ - VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, /* max protection */ + VM_PROT_ALL, /* max protection */ MAP_PRIVATE | MAP_FIXED, /* flags */ (caddr_t)imgp->vp, /* vnode */ file_offset); /* offset */ if (error) return (error); /* - * Map data read/write (if text is 0, assume text is in data area - * [Bill's screwball mode]) + * allow writing of data */ - vmaddr = virtual_offset + a_out->a_text; - error = - vm_mmap(&vmspace->vm_map, - &vmaddr, - a_out->a_data, - VM_PROT_READ | VM_PROT_WRITE | (a_out->a_text ? 0 : VM_PROT_EXECUTE), - VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, - (caddr_t) imgp->vp, - file_offset + a_out->a_text); - if (error) - return (error); + vm_map_protect(&vmspace->vm_map, + vmaddr + a_out->a_text, + vmaddr + a_out->a_text + a_out->a_data, + VM_PROT_ALL, + FALSE); if (bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data * "bss" = 'block started by symbol' - named after the IBM 7090 * instruction of the same name. */ vmaddr = virtual_offset + a_out->a_text + a_out->a_data; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE); + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return (error); } /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) virtual_offset + a_out->a_text; /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; /* Indicate that this file should not be modified */ imgp->vp->v_flag |= VTEXT; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. * Since `const' objects end up in the text segment, TEXT_SET is the * correct directive to use. */ static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; TEXT_SET(execsw_set, aout_execsw); Index: head/sys/kern/imgact_gzip.c =================================================================== --- head/sys/kern/imgact_gzip.c (revision 13489) +++ head/sys/kern/imgact_gzip.c (revision 13490) @@ -1,379 +1,379 @@ /* * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. 
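The hunk above is the substance of this change for a.out: instead of two vm_mmap() calls (text, then data), the file is mapped once read/execute across text+data, and the data span is then opened up with vm_map_protect(). A userland analogy with plain POSIX calls standing in for the kernel primitives; it assumes `text' is page aligned, which the header checks above guarantee:

        #include <stddef.h>
        #include <sys/mman.h>

        /* Map text+data from `fd' in one shot, then make data writable. */
        static void *
        map_aout_image(int fd, size_t text, size_t data, long file_offset)
        {
                void *base = mmap(NULL, text + data, PROT_READ | PROT_EXEC,
                    MAP_PRIVATE, fd, file_offset);

                if (base == MAP_FAILED)
                        return (NULL);
                /* Data stays file-backed; it simply becomes writable too. */
                if (mprotect((char *)base + text, data,
                    PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
                        return (NULL);
                return (base);
        }

Note how the old "screwball mode" (a_text of zero) falls out naturally: the protection change then covers the whole mapping.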
As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * - * $Id: imgact_gzip.c,v 1.16 1995/12/02 16:32:01 bde Exp $ + * $Id: imgact_gzip.c,v 1.17 1995/12/07 12:46:35 davidg Exp $ * * This module handles execution of a.out files which have been run through * "gzip". This saves diskspace, but wastes cpu-cycles and VM. * * TODO: * text-segments should be made R/O after being filled * is the vm-stuff safe ? * should handle the entire header of gzip'ed stuff. * inflate isn't quite reentrant yet... * error-handling is a mess... * so is the rest... * tidy up unnecesary includes */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct imgact_gzip { struct image_params *ip; struct exec a_out; int error; int where; u_char *inbuf; u_long offset; u_long output; u_long len; int idx; u_long virtual_offset, file_offset, file_end, bss_size; }; static int exec_gzip_imgact __P((struct image_params *imgp)); static int NextByte __P((void *vp)); static int do_aout_hdr __P((struct imgact_gzip *)); static int Flush __P((void *vp, u_char *, u_long siz)); static int exec_gzip_imgact(imgp) struct image_params *imgp; { int error, error2 = 0; u_char *p = (u_char *) imgp->image_header; struct imgact_gzip igz; struct inflate infl; /* If these four are not OK, it isn't a gzip file */ if (p[0] != 0x1f) return -1; /* 0 Simply magic */ if (p[1] != 0x8b) return -1; /* 1 Simply magic */ if (p[2] != 0x08) return -1; /* 2 Compression method */ if (p[9] != 0x03) return -1; /* 9 OS compressed on */ /* * If this one contains anything but a comment or a filename marker, * we don't want to chew on it */ if (p[3] & ~(0x18)) return ENOEXEC; /* 3 Flags */ /* These are of no use to us */ /* 4-7 Timestamp */ /* 8 Extra flags */ bzero(&igz, sizeof igz); bzero(&infl, sizeof infl); infl.gz_private = (void *) &igz; infl.gz_input = NextByte; infl.gz_output = Flush; igz.ip = imgp; igz.idx = 10; if (p[3] & 0x08) { /* skip a filename */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } if (p[3] & 0x10) { /* skip a comment */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } igz.len = igz.ip->attr->va_size; error = inflate(&infl); if (igz.inbuf) { error2 = vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, (vm_offset_t) igz.inbuf + PAGE_SIZE); } if (igz.error || error || error2) { printf("Output=%lu ", igz.output); printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", error, igz.error, error2, igz.where); } if (igz.error) return igz.error; if (error) return ENOEXEC; if (error2) return error2; return 0; } static int do_aout_hdr(struct imgact_gzip * gz) { int error; struct vmspace *vmspace = gz->ip->proc->p_vmspace; u_long vmaddr; /* * Set file/virtual offset based on a.out variant. 
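The byte tests at the top of exec_gzip_imgact() above are the standard gzip header checks. Restated as a single predicate, with the same offsets and a hypothetical function name:

        /* 1 if `p' points at a gzip header this image activator accepts. */
        static int
        gzip_header_ok(const unsigned char *p)
        {
                if (p[0] != 0x1f || p[1] != 0x8b)       /* magic */
                        return (0);
                if (p[2] != 0x08)                       /* deflate method */
                        return (0);
                if (p[9] != 0x03)                       /* OS: Unix */
                        return (0);
                if (p[3] & ~0x18)                       /* only FNAME/FCOMMENT */
                        return (0);
                return (1);
        }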
We do two cases: * host byte order and network byte order (for NetBSD compatibility) */ switch ((int) (gz->a_out.a_magic & 0xffff)) { case ZMAGIC: gz->virtual_offset = 0; if (gz->a_out.a_text) { gz->file_offset = NBPG; } else { /* Bill's "screwball mode" */ gz->file_offset = 0; } break; case QMAGIC: gz->virtual_offset = NBPG; gz->file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: gz->virtual_offset = NBPG; gz->file_offset = 0; break; default: gz->where = __LINE__; return (-1); } } gz->bss_size = roundup(gz->a_out.a_bss, NBPG); /* * Check various fields in header for validity/bounds. */ if ( /* entry point must lay with text region */ gz->a_out.a_entry < gz->virtual_offset || gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || /* text and data size must each be page rounded */ gz->a_out.a_text % NBPG || gz->a_out.a_data % NBPG) { gz->where = __LINE__; return (-1); } /* * text/data/bss must not exceed limits */ if ( /* text can't exceed maximum text size */ gz->a_out.a_text > MAXTSIZ || /* data + bss can't exceed maximum data size */ gz->a_out.a_data + gz->bss_size > MAXDSIZ || /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { gz->where = __LINE__; return (ENOMEM); } /* Find out how far we should go */ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; /* copy in arguments and/or environment from old process */ error = exec_extract_strings(gz->ip); if (error) { gz->where = __LINE__; return (error); } /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(gz->ip); vmaddr = gz->virtual_offset; error = vm_mmap(&vmspace->vm_map, /* map */ &vmaddr,/* address */ gz->a_out.a_text, /* size */ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, /* protection */ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, MAP_ANON | MAP_FIXED, /* flags */ 0, /* vnode */ 0); /* offset */ if (error) { gz->where = __LINE__; return (error); } vmaddr = gz->virtual_offset + gz->a_out.a_text; /* * Map data read/write (if text is 0, assume text is in data area * [Bill's screwball mode]) */ error = vm_mmap(&vmspace->vm_map, &vmaddr, gz->a_out.a_data, VM_PROT_READ | VM_PROT_WRITE | (gz->a_out.a_text ? 0 : VM_PROT_EXECUTE), VM_PROT_ALL, MAP_ANON | MAP_FIXED, 0, 0); if (error) { gz->where = __LINE__; return (error); } if (gz->bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data "bss" = 'block * started by symbol' - named after the IBM 7090 instruction of the * same name. 
*/ vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE); + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); } } /* Fill in process VM information */ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) gz->virtual_offset + gz->a_out.a_text; /* Fill in image_params */ gz->ip->interpreted = 0; gz->ip->entry_addr = gz->a_out.a_entry; gz->ip->proc->p_sysent = &aout_sysvec; return 0; } static int NextByte(void *vp) { int error; struct imgact_gzip *igz = (struct imgact_gzip *) vp; if (igz->idx >= igz->len) { igz->where = __LINE__; return GZ_EOF; } if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { return igz->inbuf[(igz->idx++) - igz->offset]; } if (igz->inbuf) { error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, (vm_offset_t) igz->inbuf + PAGE_SIZE); if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } } igz->offset = igz->idx & ~PAGE_MASK; error = vm_mmap(kernel_map, /* map */ (vm_offset_t *) & igz->inbuf, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ (caddr_t) igz->ip->vp, /* vnode */ igz->offset); /* offset */ if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } return igz->inbuf[(igz->idx++) - igz->offset]; } static int Flush(void *vp, u_char * ptr, u_long siz) { struct imgact_gzip *gz = (struct imgact_gzip *) vp; u_char *p = ptr, *q; int i; /* First, find a a.out-header */ if (gz->output < sizeof gz->a_out) { q = (u_char *) & gz->a_out; i = min(siz, sizeof gz->a_out - gz->output); bcopy(p, q + gz->output, i); gz->output += i; p += i; siz -= i; if (gz->output == sizeof gz->a_out) { i = do_aout_hdr(gz); if (i == -1) { if (!gz->where) gz->where = __LINE__; gz->error = ENOEXEC; return ENOEXEC; } else if (i) { gz->where = __LINE__; gz->error = i; return ENOEXEC; } if (gz->file_offset < sizeof gz->a_out) { q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; bcopy(&gz->a_out, q, sizeof gz->a_out - gz->file_offset); } } } /* Skip over zero-padded first PAGE if needed */ if (gz->output < gz->file_offset && (gz->output + siz) > gz->file_offset) { i = min(siz, gz->file_offset - gz->output); gz->output += i; p += i; siz -= i; } if (gz->output >= gz->file_offset && gz->output < gz->file_end) { i = min(siz, gz->file_end - gz->output); q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; bcopy(p, q, i); gz->output += i; p += i; siz -= i; } gz->output += siz; return 0; } /* * Tell kern_execve.c about it, with a little help from the linker. * Since `const' objects end up in the text segment, TEXT_SET is the * correct directive to use. */ static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; TEXT_SET(execsw_set, gzip_execsw); Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 13489) +++ head/sys/kern/init_main.c (revision 13490) @@ -1,629 +1,629 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
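Flush() above has to parse the a.out header out of a decompressed byte stream that arrives in arbitrarily sized chunks, so it accumulates the first sizeof(struct exec) bytes in a side buffer before calling do_aout_hdr(). A reduced sketch of that gathering step, with a hypothetical stream type and a 32-byte array standing in for struct exec:

        #include <string.h>

        struct hdr_stream {
                unsigned char   hdr[32];        /* stand-in for struct exec */
                unsigned long   seen;           /* header bytes seen so far */
        };

        /* Consume header bytes from *p/*len; returns 1 once complete. */
        static int
        gather_header(struct hdr_stream *s, const unsigned char **p,
            unsigned long *len)
        {
                unsigned long want = sizeof(s->hdr) - s->seen;
                unsigned long take = *len < want ? *len : want;

                memcpy(s->hdr + s->seen, *p, take);
                s->seen += take;
                *p += take;
                *len -= take;
                return (s->seen == sizeof(s->hdr));
        }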
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 - * $Id: init_main.c,v 1.35 1995/12/07 12:46:36 davidg Exp $ + * $Id: init_main.c,v 1.36 1995/12/10 13:45:11 phk Exp $ */ #include #include #include #include #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ extern void __main __P((void)); extern void main __P((void *framep)); /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *curproc = &proc0; struct proc *initproc; static int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto; struct timeval boottime; SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RW, &boottime, timeval, ""); struct timeval runtime; /* * Promiscuous argument pass for start_init() * * This is a kludge because we use a return from main() rather than a call * to a new reoutine in locore.s to kick the kernel alive from locore.s. */ static void *init_framep; #if __GNUC__ >= 2 void __main() {} #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. 
*/ SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads", like an LFS * cleaner. */ void main(framep) void *framep; { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ int rval[2]; /* SI_TYPE_KTHREAD support*/ /* * Save the locore.s frame pointer for start_init(). */ init_framep = framep; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the * operation which ensures continued function. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { for( xipp = sipp + 1; *xipp; xipp++) { if( (*sipp)->subsystem < (*xipp)->subsystem || ( (*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order < (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { if( (*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ switch( (*sipp)->type) { case SI_TYPE_DEFAULT: /* no special processing*/ (*((*sipp)->func))( (*sipp)->udata); break; case SI_TYPE_KTHREAD: /* kernel thread*/ if (fork(&proc0, NULL, rval)) panic("fork kernel process"); if (rval[1]) { (*((*sipp)->func))( (*sipp)->udata); /* * The call to start "init" returns * here after the scheduler has been * started, and returns to the caller * in i386/i386/locore.s. This is a * necessary part of initialization * and is rather non-obvious. * * No other "kernel threads" should * return here. Call panic() instead. */ return; } break; default: panic( "init_main: unrecognized init type"); } } /* NOTREACHED*/ } /* * Start a kernel process. This is called after a fork() call in * main() in the file kern/init_main.c. * * This function is used to start "internal" daemons. */ /* ARGSUSED*/ void kproc_start(udata) void *udata; { struct kproc_desc *kp = udata; struct proc *p = curproc; /* save a global descriptor, if desired*/ if( kp->global_procpp != NULL) *kp->global_procpp = p; /* this is a non-swapped system process*/ p->p_flag |= P_INMEM | P_SYSTEM; /* set up arg0 for 'ps', et al*/ strcpy( p->p_comm, kp->arg0); /* call the processes' main()...*/ (*kp->func)(); /* NOTREACHED */ panic("kproc_start: %s", kp->arg0); } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ #ifdef OMIT /* * Handled by vfs_mountroot (bad idea) at this time... should be * done the same as 4.4Lite2. 
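The sort main() performs above is a plain bubble sort over the linker set with two keys: subsystem first, order second. The same comparison, reduced to a standalone sketch over a NULL-terminated vector, with a two-field struct standing in for struct sysinit:

        struct si {
                int     subsystem;      /* primary key */
                int     order;          /* secondary key */
        };

        static void
        sysinit_sort(struct si **v)
        {
                struct si **sipp, **xipp, *save;

                for (sipp = v; *sipp; sipp++)
                        for (xipp = sipp + 1; *xipp; xipp++) {
                                if ((*sipp)->subsystem < (*xipp)->subsystem ||
                                    ((*sipp)->subsystem == (*xipp)->subsystem &&
                                    (*sipp)->order < (*xipp)->order))
                                        continue;       /* already ordered */
                                save = *sipp;           /* swap out of order */
                                *sipp = *xipp;
                                *xipp = save;
                        }
        }

An O(n^2) sort is fine here: it runs once at boot over a modest, link-time-fixed set.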
*/ SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) #endif /* OMIT*/ /* * Should get its own file... */ #ifdef HPFPLIB char copyright[] = "Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; #else char copyright[] = "Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; #endif static void print_caddr_t __P((void *data)); static void print_caddr_t(data) void *data; { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init __P((void *dummy)); static void proc0_init(dummy) void *dummy; { register struct proc *p; register struct filedesc0 *fdp; register int i; /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ p = &proc0; curproc = p; /* XXX redundant*/ /* * Create process 0 (the swapper). */ allproc = (volatile struct proc *)p; p->p_prev = (struct proc **)&allproc; p->p_pgrp = &pgrp0; pgrphash[0] = &pgrp0; pgrp0.pg_mem = p; pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; p->p_sysent = &aout_sysvec; p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; p->p_rtprio.type = RTP_PRIO_NORMAL; p->p_rtprio.prio = 0; bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. */ cred0.p_refcnt = 1; p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit(&vmspace0.vm_pmap); vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAX_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ #define INCOMPAT_LITES2 #ifdef INCOMPAT_LITES2 /* * proc0 needs to have a coherent frame base, too. * This probably makes the identical call for the init proc * that happens later unnecessary since it should inherit * it during the fork. */ cpu_set_init_frame(p, init_framep); /* XXX! 
 */
#endif	/* INCOMPAT_LITES2*/

	/*
	 * We continue to place resource usage info and signal
	 * actions in the user struct so they're pageable.
	 */
	p->p_stats = &p->p_addr->u_stats;
	p->p_sigacts = &p->p_addr->u_sigacts;

	/*
	 * Initialize per uid information structure and charge
	 * root for one process.
	 */
	usrinfoinit();
	(void)chgproccnt(0, 1);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)

/* ARGSUSED*/
static void proc0_post __P((void *dummy));
static void
proc0_post(dummy)
	void *dummy;
{
	/*
	 * Now can look at time, having had a chance to verify the time
	 * from the file system.  Reset p->p_rtime as it may have been
	 * munched in mi_switch() after the time got set.
	 */
	proc0.p_stats->p_start = runtime = mono_time = boottime = time;
	proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0;

	/* Initialize signal state for process 0. */
	siginit(&proc0);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)

/*
 ***************************************************************************
 ****
 **** The following SYSINIT's and glue code should be moved to the
 **** respective files on a per subsystem basis.
 ****
 ***************************************************************************
 */

/* ARGSUSED*/
static void sched_setup __P((void *dummy));
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

/* ARGSUSED*/
static void xxx_vfs_mountroot __P((void *dummy));
static void
xxx_vfs_mountroot(dummy)
	void *dummy;
{
	/* Mount the root file system. */
	if ((*mountroot)(mountrootvfsops))
		panic("cannot mount root");
}
SYSINIT(mountroot, SI_SUB_ROOT, SI_ORDER_FIRST, xxx_vfs_mountroot, NULL)

/* ARGSUSED*/
static void xxx_vfs_root_fdtab __P((void *dummy));
static void
xxx_vfs_root_fdtab(dummy)
	void *dummy;
{
	register struct filedesc0 *fdp = &filedesc0;

	/* Get the vnode for '/'.  Set fdp->fd_fd.fd_cdir to reference it. */
	if (VFS_ROOT(mountlist.cqh_first, &rootvnode))
		panic("cannot find root vnode");
	fdp->fd_fd.fd_cdir = rootvnode;
	VREF(fdp->fd_fd.fd_cdir);
	VOP_UNLOCK(rootvnode);
	fdp->fd_fd.fd_rdir = NULL;
}
SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL)

/*
 ***************************************************************************
 ****
 **** The following code probably belongs in another file, like
 **** kern/init_init.c.  It is here for two reasons only:
 ****
 ****	1) This code returns to start up the system; this is
 ****	   abnormal for a kernel thread.
 ****	2) This code promiscuously uses init_frame
 ****
 ***************************************************************************
 */

static void kthread_init __P((void *dummy));
SYSINIT_KT(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL)

static void start_init __P((struct proc *p, void *framep));

/* ARGSUSED*/
static void
kthread_init(dummy)
	void *dummy;
{
	/* Create process 1 (init(8)). */
	start_init(curproc, init_framep);

	/*
	 * This is the only kernel thread allowed to return to the
	 * caller!!!
	 */
	return;
}

/*
 * List of paths to try when searching for "init".
 */
static char *initpaths[] = {
	"/sbin/init",
	"/sbin/oinit",
	"/sbin/init.bak",
	"/stand/sysinstall",
	NULL,
};

/*
 * Start the initial user process; try exec'ing each pathname in "initpaths".
 * The program is invoked with one argument containing the boot flags.
*/ static void start_init(p, framep) struct proc *p; void *framep; { vm_offset_t addr; struct execve_args args; int options, i, retval[2], error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; /* * We need to set the system call frame as if we were entered through * a syscall() so that when we call execve() below, it will be able * to set the entry point (see setregs) when it tries to exec. The * startup code in "locore.s" has allocated space for the frame and * passed a pointer to that space as main's argument. */ cpu_set_init_frame(p, framep); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); - if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE) != 0) + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ for (i = strlen(path) + 1; i >= 0; i--) (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ (void)suword((caddr_t)--uap, (int)arg1); (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise return to main() which returns to btext * which completes the system startup. */ if ((error = execve(p, &args, &retval[0])) == 0) return; if (error != ENOENT) printf("exec %s: error %d\n", path, error); } printf("init: not found\n"); panic("no init"); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 13489) +++ head/sys/kern/kern_exec.c (revision 13490) @@ -1,584 +1,584 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman * 4. The name of the developer may not be used to endorse or promote products * derived from this software without specific prior written permission. 
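start_init() above fakes up an execve() image by writing bytes and words straight onto the new process stack with subyte()/suword(), growing downward: a trailing NUL, the flag characters, the leading hyphen, the path, then word-aligned argv pointers. The same downward layout against an ordinary buffer, as an illustrative helper rather than kernel code:

        #include <stddef.h>
        #include <string.h>

        /* Copy string `s' (with its NUL) just below `sp'; return new top. */
        static char *
        push_string(char *sp, const char *s)
        {
                size_t n = strlen(s) + 1;

                memcpy(sp - n, s, n);
                return (sp - n);
        }

        /* Example: lay out path + flags the way start_init() does. */
        static char **
        build_args(char *stack_top, const char *path, const char *flags,
            char **argvec /* at least 3 slots */)
        {
                char *arg1 = push_string(stack_top, flags);     /* e.g. "-s" */
                char *arg0 = push_string(arg1, path);   /* e.g. "/sbin/init" */

                argvec[0] = arg0;
                argvec[1] = arg1;
                argvec[2] = NULL;       /* terminator, the suword(..., 0) above */
                return (argvec);
        }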
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: kern_exec.c,v 1.31 1996/01/04 20:28:45 wollman Exp $ + * $Id: kern_exec.c,v 1.32 1996/01/08 04:30:41 peter Exp $ */ #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int *exec_copyout_strings __P((struct image_params *)); static int exec_check_permissions(struct image_params *); /* * execsw_set is constructed for us by the linker. Each of the items * is a pointer to a `const struct execsw', hence the double pointer here. */ static const struct execsw **execsw = (const struct execsw **)&execsw_set.ls_items[0]; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. */ int execve(p, uap, retval) struct proc *p; register struct execve_args *uap; int *retval; { struct nameidata nd, *ndp; int *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->image_header = NULL; imgp->argc = imgp->envc = 0; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_pageable(exec_map, ARG_MAX); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, p); interpret: error = namei(ndp); if (error) { kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); goto exec_fail; } imgp->vp = ndp->ni_vp; if (imgp->vp == NULL) { error = ENOEXEC; goto exec_fail_dealloc; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); /* * Lose the lock on the vnode. It's no longer needed, and must not * exist for the pagefault paging to work below. */ VOP_UNLOCK(imgp->vp); if (error) goto exec_fail_dealloc; /* * Map the image header (first page) of the file into * kernel address space */ error = vm_mmap(kernel_map, /* map */ (vm_offset_t *)&imgp->image_header, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ (caddr_t)imgp->vp, /* vnode */ 0); /* offset */ if (error) { uprintf("mmap failed: %d\n",error); goto exec_fail_dealloc; } /* * Loop through list of image activators, calling each one. 
 * If there is no match, the activator returns -1.  If there
 * is a match, but there was an error during the activation,
 * the error is returned.  Otherwise 0 means success.  If the
 * image is interpreted, loop back up and try activating
 * the interpreter.
 */
	for (i = 0; execsw[i]; ++i) {
		if (execsw[i]->ex_imgact)
			error = (*execsw[i]->ex_imgact)(imgp);
		else
			continue;

		if (error == -1)
			continue;
		if (error)
			goto exec_fail_dealloc;
		if (imgp->interpreted) {
			/* free old vnode and name buffer */
			vrele(ndp->ni_vp);
			FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
			if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header,
			    (vm_offset_t)imgp->image_header + PAGE_SIZE))
				panic("execve: header dealloc failed (1)");

			/* set new name to that of the interpreter */
			NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
			    UIO_SYSSPACE, imgp->interpreter_name, p);
			goto interpret;
		}
		break;
	}
	/* If we made it through all the activators and none matched, exit. */
	if (error == -1) {
		error = ENOEXEC;
		goto exec_fail_dealloc;
	}

	/*
	 * Copy out strings (args and env) and initialize stack base
	 */
	stack_base = exec_copyout_strings(imgp);
	p->p_vmspace->vm_minsaddr = (char *)stack_base;

	/*
	 * If a custom stack fixup routine is present for this process,
	 * let it do the stack setup; else stuff the argument count in
	 * as the first item on the stack.
	 */
	if (p->p_sysent->sv_fixup)
		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	else
		suword(--stack_base, imgp->argc);

	/* close files on exec */
	fdcloseexec(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
	p->p_comm[len] = 0;

	/*
	 * mark as executable, wake up any process that was vforked and tell
	 * it that it now has its own resources back
	 */
	p->p_flag |= P_EXEC;
	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
		p->p_flag &= ~P_PPWAIT;
		wakeup((caddr_t)p->p_pptr);
	}

	/*
	 * Implement image setuid/setgid.  Disallow if the process is
	 * being traced.
	 */
	if ((attr.va_mode & (VSUID | VSGID)) &&
	    (p->p_flag & P_TRACED) == 0) {
		/*
		 * Turn off syscall tracing for set-id programs, except for
		 * root.
		 */
		if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
			p->p_traceflag = 0;
			vrele(p->p_tracep);
			p->p_tracep = NULL;
		}
		/*
		 * Set the new credentials.
		 */
		p->p_ucred = crcopy(p->p_ucred);
		if (attr.va_mode & VSUID)
			p->p_ucred->cr_uid = attr.va_uid;
		if (attr.va_mode & VSGID)
			p->p_ucred->cr_groups[0] = attr.va_gid;
		p->p_flag |= P_SUGID;
	} else {
		p->p_flag &= ~P_SUGID;
	}

	/*
	 * Implement correct POSIX saved-id behavior.
	 */
	p->p_cred->p_svuid = p->p_ucred->cr_uid;
	p->p_cred->p_svgid = p->p_ucred->cr_gid;

	/*
	 * Store the vp for use in procfs
	 */
	if (p->p_textvp)		/* release old reference */
		vrele(p->p_textvp);
	VREF(ndp->ni_vp);
	p->p_textvp = ndp->ni_vp;

	/*
	 * If tracing the process, trap to debugger so breakpoints
	 * can be set before the program executes.
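The activator protocol in the loop above is simple: each entry returns -1 for "not my format", 0 on success, or a real errno, and an interpreted image restarts the whole lookup on the interpreter's name. Reduced to a sketch, with imgact_fn and the table as illustrative stand-ins for the execsw entries:

        #include <errno.h>
        #include <stddef.h>

        typedef int (*imgact_fn)(void *img);

        /* Try each activator in turn; -1 from all of them means ENOEXEC. */
        static int
        run_activators(imgact_fn *tab, void *img)
        {
                int error = -1;
                int i;

                for (i = 0; tab[i] != NULL; i++) {
                        error = tab[i](img);
                        if (error != -1)
                                break;          /* matched (0) or hard error */
                }
                return (error == -1 ? ENOEXEC : error);
        }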
*/ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Set entry address */ setregs(p, imgp->entry_addr, (u_long)stack_base); /* * free various allocated resources */ kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (2)"); vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); return (0); exec_fail_dealloc: if (imgp->stringbase != NULL) kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (imgp->image_header && imgp->image_header != (char *)-1) if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (3)"); if (ndp->ni_vp) vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); exec_fail: if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(p, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ return(0); } else { return(error); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp) struct image_params *imgp; { int error; struct vmspace *vmspace = imgp->proc->p_vmspace; caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); imgp->vmspace_destroyed = 1; /* Blow away entire process VM */ if (vmspace->vm_shm) shmexit(imgp->proc); vm_map_remove(&vmspace->vm_map, 0, USRSTACK); /* Allocate a new stack */ error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr, - SGROWSIZ, FALSE); + SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return(error); vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; /* Initialize maximum stack address */ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; return(0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error, length; /* * extract arguments first */ argv = imgp->uap->argv; if (argv) { while ((argp = (caddr_t) fuword(argv++))) { if (argp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } } /* * extract environment strings */ envv = imgp->uap->envv; if (envv) { while ((envp = (caddr_t) fuword(envv++))) { if (envp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. */ int * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; int *stack_base; struct ps_strings *arginfo; /* * Calculate string base and vector table pointers. 
 */
	arginfo = PS_STRINGS;
	destp = (caddr_t)arginfo - SPARE_USRSPACE -
	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));

	/*
	 * The '+ 2' is for the null pointers at the end of each of the
	 * arg and env vector sets
	 */
	vectp = (char **) (destp -
	    (imgp->argc + imgp->envc + 2) * sizeof(char *));

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = (int *)vectp;

	stringp = imgp->stringbase;
	argc = imgp->argc;
	envc = imgp->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (int)vectp);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		suword(vectp++, (int)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vectp++, NULL);

	suword(&arginfo->ps_envstr, (int)vectp);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword(vectp++, (int)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword(vectp, NULL);

	return (stack_base);
}

/*
 * Check permissions of file to execute.
 * Return 0 for success or error code on failure.
 */
static int
exec_check_permissions(imgp)
	struct image_params *imgp;
{
	struct proc *p = imgp->proc;
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	int error;

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vp->v_writecount) {
		return (ETXTBSY);
	}

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, p->p_ucred, p);
	if (error)
		return (error);

	/*
	 * 1) Check if file execution is disabled for the filesystem that this
	 *	file resides on.
	 * 2) Ensure that at least one execute bit is on - otherwise root
	 *	will always succeed, and we don't want that to happen unless
	 *	the file really is executable.
	 * 3) Ensure that the file is a regular file.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr->va_mode & 0111) == 0) ||
	    (attr->va_type != VREG)) {
		return (EACCES);
	}

	/*
	 * Zero length files can't be exec'd
	 */
	if (attr->va_size == 0)
		return (ENOEXEC);

	/*
	 * Disable setuid/setgid if the filesystem prohibits it or if
	 * the process is being traced.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
		attr->va_mode &= ~(VSUID | VSGID);

	/*
	 * Check for execute permission to file based on current credentials.
	 * Then call filesystem specific open routine (which does nothing
	 * in the general case).
	 */
	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
	if (error)
		return (error);

	error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
	if (error)
		return (error);

	return (0);
}
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c	(revision 13489)
+++ head/sys/kern/kern_exit.c	(revision 13490)
@@ -1,516 +1,516 @@
/*-
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 - * $Id: kern_exit.c,v 1.25 1996/01/04 20:28:46 wollman Exp $ + * $Id: kern_exit.c,v 1.26 1996/01/08 04:30:44 peter Exp $ */ #include "opt_ktrace.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_43 #include #include #endif #include #include #include #include #include #include #include static int wait1 __P((struct proc *, struct wait_args *, int [], int)); /* * exit -- * Death of process. */ __dead void exit(p, uap, retval) struct proc *p; struct rexit_args /* { int rval; } */ *uap; int *retval; { exit1(p, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ __dead void exit1(p, rv) register struct proc *p; int rv; { register struct proc *q, *nq; register struct proc **pp; register struct vmspace *vm; if (p->p_pid == 1) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } #ifdef PGINPROF vmsizmon(); #endif if (p->p_flag & P_PROFIL) stopprofclock(p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ p->p_flag &= ~(P_TRACED | P_PPWAIT); p->p_flag |= P_WEXIT; p->p_sigignore = ~0; p->p_siglist = 0; untimeout(realitexpire, (caddr_t)p); /* * Close open files and release open-file table. * This may block! */ fdfree(p); /* * XXX Shutdown SYSV semaphores */ semexit(p); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; if (vm->vm_shm) shmexit(p); /* * Release user portion of address space. 
* This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. */ if (vm->vm_refcnt == 1) (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); if (SESS_LEADER(p)) { register struct session *sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp->t_session == sp) { if (sp->s_ttyp->t_pgrp) pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); (void) ttywait(sp->s_ttyp); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) vgoneall(sp->s_ttyvp); } if (sp->s_ttyvp) vrele(sp->s_ttyvp); sp->s_ttyvp = NULL; /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } sp->s_leader = NULL; } fixjobc(p, p->p_pgrp, 0); p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; (void)acct_process(p); #ifdef KTRACE /* * release trace file */ p->p_traceflag = 0; /* don't trace the vrele() */ if (p->p_tracep) vrele(p->p_tracep); #endif /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ if ((*p->p_prev = p->p_next)) p->p_next->p_prev = p->p_prev; if ((p->p_next = zombproc)) p->p_next->p_prev = &p->p_next; p->p_prev = &zombproc; zombproc = p; p->p_stat = SZOMB; for (pp = &pidhash[PIDHASH(p->p_pid)]; *pp; pp = &(*pp)->p_hash) if (*pp == p) { *pp = p->p_hash; goto done; } panic("exit"); done: if (p->p_cptr) /* only need this if any child is S_ZOMB */ wakeup((caddr_t) initproc); for (q = p->p_cptr; q != NULL; q = nq) { nq = q->p_osptr; if (nq != NULL) nq->p_ysptr = NULL; if (initproc->p_cptr) initproc->p_cptr->p_ysptr = q; q->p_osptr = initproc->p_cptr; q->p_ysptr = NULL; initproc->p_cptr = q; q->p_pptr = initproc; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } } p->p_cptr = NULL; /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ p->p_xstat = rv; *p->p_ru = p->p_stats->p_ru; calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify parent that we're gone. */ psignal(p->p_pptr, SIGCHLD); wakeup((caddr_t)p->p_pptr); #if defined(tahoe) /* move this to cpu_exit */ p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL; #endif /* * Clear curproc after we've done all operations * that could block, and before tearing down the rest * of the process state that might be used from clock, etc. * Also, can't clear curproc while we're still runnable, * as we're not on a run queue (we are current, just not * a proper proc any longer!). * * Other substructures are freed from wait(). */ curproc = NULL; if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; } /* * Finally, call machine-dependent code to release the remaining * resources including address space, the kernel stack and pcb. * The address space is released by "vmspace_free(p->p_vmspace)"; * This is machine-dependent, as we may have to change stacks * or ensure that the current one isn't reallocated before we * finish. cpu_exit will end with a call to cpu_swtch(), finishing * our execution (pun intended). 
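 */

/*
 * Illustration (not part of this commit): the status word built with
 * W_EXITCODE() and saved in p_xstat above is what a parent eventually
 * decodes with the standard wait macros.  A minimal userland sketch of
 * the receiving side:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

static void
reap(pid_t pid)
{
	int status;

	if (waitpid(pid, &status, 0) == -1)
		return;
	if (WIFEXITED(status))		/* normal exit: signal bits are 0 */
		printf("exit code %d\n", WEXITSTATUS(status));
	else if (WIFSIGNALED(status))	/* killed: low bits hold the signal */
		printf("signal %d\n", WTERMSIG(status));
}
#endif

/*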
*/ cpu_exit(p); } #ifdef COMPAT_43 #if defined(hp300) || defined(luna68k) #include #define GETPS(rp) ((struct frame *)(rp))->f_sr #else #define GETPS(rp) (rp)[PS] #endif int owait(p, uap, retval) struct proc *p; register struct owait_args /* { int dummy; } */ *uap; int *retval; { struct wait_args w; #ifdef PSL_ALLCC if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { w.options = 0; w.rusage = NULL; } else { w.options = p->p_md.md_regs[R0]; w.rusage = (struct rusage *)p->p_md.md_regs[R1]; } #else w.options = 0; w.rusage = NULL; #endif w.pid = WAIT_ANY; w.status = NULL; return (wait1(p, &w, retval, 1)); } #endif /* COMPAT_43 */ int wait4(p, uap, retval) struct proc *p; struct wait_args *uap; int *retval; { return (wait1(p, uap, retval, 0)); } static int wait1(q, uap, retval, compat) register struct proc *q; register struct wait_args /* { int pid; int *status; int options; struct rusage *rusage; } */ *uap; int retval[]; int compat; { register int nfound; register struct proc *p, *t; int status, error; if (uap->pid == 0) uap->pid = -q->p_pgid; #ifdef notyet if (uap->options &~ (WUNTRACED|WNOHANG)) return (EINVAL); #endif loop: nfound = 0; for (p = q->p_cptr; p; p = p->p_osptr) { if (uap->pid != WAIT_ANY && p->p_pid != uap->pid && p->p_pgid != -uap->pid) continue; nfound++; if (p->p_stat == SZOMB) { /* charge child's scheduling cpu usage to parent */ if (curproc->p_pid != 1) { curproc->p_estcpu = min(curproc->p_estcpu + p->p_estcpu, UCHAR_MAX); } retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ if ((error = copyout((caddr_t)&status, (caddr_t)uap->status, sizeof(status)))) return (error); } if (uap->rusage && (error = copyout((caddr_t)p->p_ru, (caddr_t)uap->rusage, sizeof (struct rusage)))) return (error); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ if (p->p_oppid && (t = pfind(p->p_oppid))) { p->p_oppid = 0; proc_reparent(p, t); psignal(t, SIGCHLD); wakeup((caddr_t)t); return (0); } p->p_xstat = 0; ruadd(&q->p_stats->p_cru, p->p_ru); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_cred->p_ruid, -1); /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* * Free up credentials. */ if (--p->p_cred->p_refcnt == 0) { crfree(p->p_cred->pc_ucred); FREE(p->p_cred, M_SUBPROC); p->p_cred = NULL; } - - /* - * Release reference to text vnode - */ - if (p->p_textvp) - vrele(p->p_textvp); /* * Finally finished with old proc entry. * Unlink it from its process group and free it. */ leavepgrp(p); if ((*p->p_prev = p->p_next)) /* off zombproc */ p->p_next->p_prev = p->p_prev; if ((q = p->p_ysptr)) q->p_osptr = p->p_osptr; if ((q = p->p_osptr)) q->p_ysptr = p->p_ysptr; if ((q = p->p_pptr)->p_cptr == p) q->p_cptr = p->p_osptr; /* * Give machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context.
*/ cpu_wait(p); FREE(p, M_PROC); nprocs--; return (0); } if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) { retval[1] = W_STOPCODE(p->p_xstat); error = 0; } else #endif if (uap->status) { status = W_STOPCODE(p->p_xstat); error = copyout((caddr_t)&status, (caddr_t)uap->status, sizeof(status)); } else error = 0; return (error); } } if (nfound == 0) return (ECHILD); if (uap->options & WNOHANG) { retval[0] = 0; return (0); } if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) return (error); goto loop; } /* * make process 'parent' the new parent of process 'child'. */ void proc_reparent(child, parent) register struct proc *child; register struct proc *parent; { register struct proc *o; register struct proc *y; if (child->p_pptr == parent) return; /* fix up the child linkage for the old parent */ o = child->p_osptr; y = child->p_ysptr; if (y) y->p_osptr = o; if (o) o->p_ysptr = y; if (child->p_pptr->p_cptr == child) child->p_pptr->p_cptr = o; /* fix up child linkage for new parent */ o = parent->p_cptr; if (o) o->p_ysptr = child; child->p_osptr = o; child->p_ysptr = NULL; parent->p_cptr = child; child->p_pptr = parent; } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 13489) +++ head/sys/kern/subr_trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. 
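 */

/*
 * Illustration (not part of this commit): a user-mode integer divide
 * fault (T_DIVIDE in the switch below) is delivered as SIGFPE with code
 * FPE_INTDIV_TRAP.  A minimal userland sketch of the receiving end; the
 * deliberate divide by zero is, strictly speaking, undefined behavior in
 * C and simply relies on the i386 #DE trap:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void
fpe_handler(int sig)
{
	printf("caught SIGFPE\n");
	_exit(0);
}

int
main(void)
{
	volatile int zero = 0;

	signal(SIGFPE, fpe_handler);
	return (1 / zero);	/* traps; trap() posts SIGFPE */
}
#endif

/*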
*/ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. 
We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. 
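 */

/*
 * Illustration (not part of this commit): when trap_pfault() cannot
 * resolve a user-mode fault it returns SIGSEGV, or SIGBUS for a
 * protection failure, with the faulting address passed to sendsig()
 * through tf_err.  A minimal userland sketch that provokes such a fault:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/mman.h>

int
main(void)
{
	char *p;

	/* map a page, then revoke all access so the next touch faults */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	mprotect(p, 4096, PROT_NONE);
	*p = 1;			/* vm_fault() fails -> SIGSEGV/SIGBUS */
	return (0);
}
#endif

/*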
*/ if (usermode) goto nogo; /* * Since we know that kernel virtual addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ?
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { struct pcb *pcb = curpcb; if (pcb != NULL) { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); } panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va, v; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; ++p->p_lock; if ((caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { --p->p_lock; return (1); } } v = trunc_page(vtopte(va)); /* * wire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); /* * unwire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE); } --p->p_lock; if (rv != KERN_SUCCESS) return 1; return (0); } /* * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int args[8], rval[2]; u_int code; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); p->p_md.md_regs = (int *)&frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; if ((i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif rval[0] = 0; rval[1] = frame.tf_edx; error = (*callp->sy_call)(p, args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_edx = rval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes. */ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. 
*/ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 13489) +++ head/sys/kern/sys_process.c (revision 13490) @@ -1,371 +1,371 @@ /* * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: sys_process.c,v 1.18 1995/12/16 21:43:47 bde Exp $ + * $Id: sys_process.c,v 1.19 1995/12/17 06:59:36 bde Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int pread (struct proc *procp, unsigned int addr, unsigned int *retval) { int rv; vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired, single_use; vm_pindex_t pindex; /* Map page into kernel space */ map = &procp->p_vmspace->vm_map; page_offset = addr - trunc_page(addr); pageno = trunc_page(addr); tmap = map; rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); if (rv != KERN_SUCCESS) return EINVAL; vm_map_lookup_done (tmap, out_entry); /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), &kva, - PAGE_SIZE, 1); + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference (object); rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); if (!rv) { *retval = 0; bcopy ((caddr_t)kva + page_offset, retval, sizeof *retval); } vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); } return rv; } static int pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { int rv; vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired, single_use; vm_pindex_t pindex; boolean_t fix_prot = 0; /* Map page into kernel space */ map = &procp->p_vmspace->vm_map; page_offset = addr - trunc_page(addr); pageno = trunc_page(addr); /* * Check the permissions for the area we're interested in. */ if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE) == FALSE) { /* * If the page was not writable, we make it so. * XXX It is possible a page may *not* be read/executable, * if a process changes that! */ fix_prot = 1; /* The page isn't writable, so let's try making it so... */ if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0)) != KERN_SUCCESS) return EFAULT; /* I guess... */ } /* * Now we need to get the page. out_entry, out_prot, wired, and * single_use aren't used. One would think the vm code would be * a *bit* nicer... We use tmap because vm_map_lookup() can * change the map argument. */ tmap = map; rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); if (rv != KERN_SUCCESS) { return EINVAL; } /* * Okay, we've got the page. Let's release tmap. */ vm_map_lookup_done (tmap, out_entry); /* * Fault the page in... 
*/ vm_map_pageable(map, trunc_page(vtopte(pageno)), trunc_page(vtopte(pageno)) + PAGE_SIZE, FALSE); rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); vm_map_pageable(map, trunc_page(vtopte(pageno)), trunc_page(vtopte(pageno)) + PAGE_SIZE, TRUE); if (rv != KERN_SUCCESS) return EFAULT; /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), &kva, - PAGE_SIZE, 1); - + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference (object); rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); if (!rv) { bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); } vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect (map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); return rv; } /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif int ptrace(curp, uap, retval) struct proc *curp; struct ptrace_args *uap; int *retval; { struct proc *p; int error = 0; *retval = 0; if (uap->req == PT_TRACE_ME) { curp->p_flag |= P_TRACED; return 0; } if ((p = pfind(uap->pid)) == NULL) { return ESRCH; } #ifdef PT_ATTACH if (uap->req != PT_ATTACH && ( (p->p_flag & P_TRACED) == 0 || (p->p_tptr && curp != p->p_tptr) || (!p->p_tptr && curp != p->p_pptr))) return ESRCH; #endif #ifdef PT_ATTACH if (uap->req != PT_ATTACH) { #endif if ((p->p_flag & P_TRACED) == 0) return EPERM; if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) return EBUSY; #ifdef PT_ATTACH } #endif /* * XXX The PT_ATTACH code is completely broken. It will * be obsoleted by a /proc filesystem, so is it worth it * to fix it? (Answer, probably. So that'll be next, * I guess.) */ switch (uap->req) { #ifdef PT_ATTACH case PT_ATTACH: if (curp->p_ucred->cr_uid != 0 && ( curp->p_ucred->cr_uid != p->p_ucred->cr_uid || curp->p_ucred->cr_uid != p->p_cred->p_svuid)) return EACCES; p->p_tptr = curp; p->p_flag |= P_TRACED; psignal(p, SIGSTOP); return 0; case PT_DETACH: if ((unsigned)uap->data >= NSIG) return EINVAL; p->p_flag &= ~P_TRACED; p->p_tptr = NULL; psignal(p->p_pptr, SIGCHLD); wakeup((caddr_t)p->p_pptr); s = splhigh(); if (p->p_stat == SSTOP) { p->p_xstat = uap->data; setrunnable(p); } else if (uap->data) { psignal(p, uap->data); } splx(s); return 0; # ifdef PT_INHERIT case PT_INHERIT: if ((p->p_flag & P_TRACED) == 0) return ESRCH; return 0; # endif /* PT_INHERIT */ #endif /* PT_ATTACH */ case PT_READ_I: case PT_READ_D: if ((error = pread (p, (unsigned int)uap->addr, retval))) return error; return 0; case PT_WRITE_I: case PT_WRITE_D: if ((error = pwrite (p, (unsigned int)uap->addr, (unsigned int)uap->data))) return error; return 0; case PT_STEP: if ((error = ptrace_single_step (p))) return error; /* fallthrough */ case PT_CONTINUE: /* * Continue at addr uap->addr with signal * uap->data; if uap->addr is 1, then we just * let the chips fall where they may. * * The only check I'll make right now is for * uap->data to be larger than NSIG; if so, we return * EINVAL. 
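 */

/*
 * Illustration (not part of this commit): the usual caller-side pattern
 * for the requests handled here -- the child volunteers with PT_TRACE_ME,
 * stops with SIGTRAP on exec, and the parent resumes it with PT_CONTINUE
 * (addr 1 meaning "wherever it stopped", data 0 meaning "no signal").
 * A minimal userland sketch:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;
	int status;

	if ((pid = fork()) == 0) {
		ptrace(PT_TRACE_ME, 0, (caddr_t)0, 0);
		execl("/bin/echo", "echo", "traced", (char *)0);
		_exit(1);
	}
	waitpid(pid, &status, 0);		/* child stops on exec */
	ptrace(PT_CONTINUE, pid, (caddr_t)1, 0);
	waitpid(pid, &status, 0);		/* reap the real exit */
	return (0);
}
#endif

/*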
*/ if (uap->data >= NSIG) return EINVAL; if (uap->addr != (caddr_t)1) { fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); if ((error = ptrace_set_pc (p, (u_int)uap->addr))) return error; } p->p_xstat = uap->data; /* if (p->p_stat == SSTOP) */ setrunnable (p); return 0; case PT_READ_U: if ((u_int)uap->addr > (UPAGES * NBPG - sizeof(int))) { return EFAULT; } p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); *retval = *(int*)((u_int)p->p_addr + (u_int)uap->addr); return 0; case PT_WRITE_U: p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); return ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); case PT_KILL: p->p_xstat = SIGKILL; setrunnable(p); return 0; #ifdef PT_GETREGS case PT_GETREGS: /* * copyout the registers into addr. There's no * size constraint!!! *GRRR* */ return ptrace_getregs(p, uap->addr); case PT_SETREGS: /* * copyin the registers from addr. Again, no * size constraint!!! *GRRRR* */ return ptrace_setregs (p, uap->addr); #endif /* PT_GETREGS */ default: break; } return 0; } int trace_req(p) struct proc *p; { return 1; } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 13489) +++ head/sys/kern/vfs_bio.c (revision 13490) @@ -1,1654 +1,1667 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: vfs_bio.c,v 1.82 1996/01/06 23:23:02 davidg Exp $ + * $Id: vfs_bio.c,v 1.83 1996/01/06 23:58:03 davidg Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. 
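 */

/*
 * Illustration (not part of this commit): the canonical consumer of this
 * cache is a filesystem read path -- look the block up through the cache,
 * use the mapped data, release the buffer.  A hypothetical sketch (the
 * function name is invented for illustration; the usual kernel headers
 * are assumed):
 */
#if 0	/* editor's sketch only */
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size)
{
	struct buf *bp;
	int error;

	/* returns cached data, or schedules and awaits the disk I/O */
	error = bread(vp, blkno, size, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... consume bp->b_data ... */
	brelse(bp);		/* give the buffer back to the queues */
	return (0);
}
#endif

/*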
*/ #include "opt_bounce.h" #define VMIO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vfs_update __P((void)); static struct proc *updateproc; static struct kproc_desc up_kp = { "update", vfs_update, &updateproc }; SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) struct buf *buf; /* buffer header pool */ struct swqueue bswlist; int count_lock_queue __P((void)); static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * buffers base kva */ caddr_t buffers_kva; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; static vm_offset_t bogus_offset; static int bufspace, maxbufspace; static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash; static struct bqueues bufqueues[BUFFER_QUEUES]; +extern int vm_swap_size; + #define BUF_MAXUSE 8 /* * Initialize buffer headers and related structures. */ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; bp->b_data = buffers_kva + i * MAXBSIZE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem blocks * to be 8K. If you happen to use a 16K filesystem, the size of the buffer * cache is still the same as it would be for 8K filesystems. This * keeps the size of the buffer cache "in check" for big block filesystems. */ maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf * bp) { int s = splbio(); if (bp->b_qindex != QUEUE_NONE) { TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { panic("bremfree: removing a buffer when not on a queue"); } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. 
*/ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); VOP_STRATEGY(rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async.) */ int bwrite(struct buf * bp) { int oldflags = bp->b_flags; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); bp->b_flags |= B_WRITEINPROG; if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { reassignbuf(bp, bp->b_vp); } bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; VOP_STRATEGY(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } brelse(bp); return (rtval); } return (0); } int vn_bwrite(ap) struct vop_bwrite_args *ap; { return (bwrite(ap->a_bp)); } /* * Delayed write. (Buffer is marked dirty). */ void bdwrite(struct buf * bp) { if ((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } if (bp->b_flags & B_INVAL) { brelse(bp); return; } if (bp->b_flags & B_TAPE) { bawrite(bp); return; } bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); } /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if( bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. 
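 */

/*
 * Illustration (not part of this commit): how a filesystem picks among
 * the three write flavors defined here; note that the delayed-write path
 * below now requeues the still-valid buffer with bqrelse() rather than
 * brelse().  A hypothetical sketch (the function name is invented for
 * illustration):
 */
#if 0	/* editor's sketch only */
static void
example_write_block(struct buf *bp, int sync, int lazy)
{
	if (sync)
		(void) bwrite(bp);	/* start the I/O and wait for it */
	else if (lazy)
		bdwrite(bp);		/* just mark dirty; written later */
	else
		bawrite(bp);		/* start the I/O, don't wait */
}
#endif

/*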
*/ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); - brelse(bp); + bqrelse(bp); return; } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp); } /* * Release a buffer. */ void brelse(struct buf * bp) { int s; if (bp->b_flags & B_CLUSTER) { relpbuf(bp); return; } /* anyone need a "free" block? */ s = splbio(); if (needsbuffer) { needsbuffer = 0; wakeup(&needsbuffer); } /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED | B_AGE); wakeup(bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; bp->b_flags &= ~(B_DELWRI | B_CACHE); - if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) + if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) { + if (bp->b_bufsize) + allocbuf(bp, 0); brelvp(bp); + } } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, so the B_INVAL flag is used to *invalidate* the buffer, * but the VM object is kept around. The B_NOCACHE flag is used to * invalidate the pages in the VM object. */ if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; vm_object_t obj; int i, resid; vm_page_t m; struct vnode *vp; int iototal = bp->b_bufsize; vp = bp->b_vp; if (!vp) panic("brelse: missing vp"); if (bp->b_npages) { vm_pindex_t poff; obj = (vm_object_t) vp->v_object; if (vp->v_type == VBLK) foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT; else foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; poff = OFF_TO_IDX(foff); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + i); if (!m) { panic("brelse: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } resid = IDX_TO_OFF(m->pindex+1) - foff; if (resid > iototal) resid = iototal; if (resid > 0) { /* * Don't invalidate the page if the local machine has already * modified it. This is the lesser of two evils, and should * be fixed. 
*/ if (bp->b_flags & (B_NOCACHE | B_ERROR)) { vm_page_test_dirty(m); if (m->dirty == 0) { vm_page_set_invalid(m, (vm_offset_t) foff, resid); if (m->valid == 0) vm_page_protect(m, VM_PROT_NONE); } } - } - foff += resid; - iototal -= resid; - } - } - - if (bp->b_flags & (B_INVAL | B_RELBUF)) { - for(i = 0; i < bp->b_npages; i++) { - m = bp->b_pages[i]; - --m->bmapped; - if (m->bmapped == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) { - if (m->object->flags & OBJ_MIGHTBEDIRTY) { - vm_page_test_dirty(m); + if (resid >= PAGE_SIZE) { + if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + bp->b_flags |= B_INVAL; } - /* - * if page isn't valid, no sense in keeping it around - */ - if (m->valid == 0) { - vm_page_protect(m, VM_PROT_NONE); - vm_page_free(m); - /* - * if page isn't dirty and hasn't been referenced by - * a process, then cache it - */ - } else if ((m->dirty & m->valid) == 0 && - (m->flags & PG_REFERENCED) == 0 && - !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { - vm_page_cache(m); - /* - * otherwise activate it - */ - } else if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; + } else { + if (!vm_page_is_valid(m, + (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) { + bp->b_flags |= B_INVAL; } } } + foff += resid; + iototal -= resid; } - bufspace -= bp->b_bufsize; - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); - bp->b_npages = 0; - bp->b_bufsize = 0; - bp->b_flags &= ~B_VMIO; - if (bp->b_vp) - brelvp(bp); } + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_AGE) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } /* unlock */ bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + if (needsbuffer) { + needsbuffer = 0; + wakeup(&needsbuffer); + } + + /* anyone need this block? 
*/ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + /* unlock */ + bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + if (m->flags & PG_WANTED) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + vm_page_unwire(m); + if (m->wire_count == 0) { + if (m->valid) { + /* + * this keeps pressure off of the process memory + */ + if ((vm_swap_size == 0) || + (cnt.v_free_count < cnt.v_free_min)) + vm_page_cache(m); + } else if ((m->hold_count == 0) && + ((m->flags & PG_BUSY) == 0) && + (m->busy == 0)) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } + } + bufspace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* * Check to see if a block is currently memory resident. */ __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. */ int vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; s = splbio(); /* * right now we support clustered writing only to regular files */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { int size; int maxcl; size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno, ncl); splx(s); return nwritten; } } bremfree(bp); splx(s); /* * default (old) behavior, writing out only one block */ bp->b_flags |= B_BUSY | B_ASYNC; nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp); return nwritten; } /* * Find a buffer header which is available for use. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int doingvmio) { struct buf *bp; int s; int nbyteswritten = 0; s = splbio(); start: if (bufspace >= maxbufspace) goto trytofreespace; /* can we constitute a new buffer? 
*/ if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) { if (bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", bp->b_qindex); + bp->b_flags |= B_BUSY; bremfree(bp); goto fillbuf; } trytofreespace: /* * We keep the file I/O from hogging metadata I/O * This is desirable because file data is cached in the * VM/Buffer cache even if a buffer is freed. */ if ((bp = bufqueues[QUEUE_AGE].tqh_first)) { if (bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue, qindex=%d", bp->b_qindex); } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue, qindex=%d", bp->b_qindex); } if (!bp) { /* wait for a free buffer of any kind */ needsbuffer = 1; tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf", slptimeo); splx(s); return (0); } if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { --bp->b_usecount; TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); if (bufqueues[QUEUE_LRU].tqh_first != NULL) { TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); goto start; } } /* if we are a delayed write, convert to an async write */ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { nbyteswritten += vfs_bio_awrite(bp); if (!slpflag && !slptimeo) { splx(s); return (0); } goto start; } if (bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup(bp); } bremfree(bp); + bp->b_flags |= B_BUSY; - if (bp->b_flags & B_VMIO) { - bp->b_flags |= B_RELBUF | B_BUSY | B_DONE; - brelse(bp); - bremfree(bp); - } + if (bp->b_flags & B_VMIO) + vfs_vmio_release(bp); if (bp->b_vp) brelvp(bp); fillbuf: /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } - bp->b_flags |= B_BUSY; + LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); splx(s); if (bp->b_bufsize) { allocbuf(bp, 0); } bp->b_flags = B_BUSY; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; bp->b_usecount = 2; if (bufspace >= maxbufspace + nbyteswritten) { s = splbio(); bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } return (bp); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; int s = splbio(); - - bh = BUFHASH(vp, blkno); - bp = bh->lh_first; - - /* Search hash chain */ - while (bp != NULL) { - /* hit */ - if (bp->b_vp == vp && bp->b_lblkno == blkno && - (bp->b_flags & B_INVAL) == 0) { - break; - } - bp = bp->b_hash.le_next; - } + bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. 
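 */

/*
 * Editor's note: the following is an illustrative, self-contained sketch of
 * the hash-chain lookup performed by gbincore()/incore(); it is not part of
 * this change.  Every "x"-prefixed name is a hypothetical stand-in for the
 * kernel's struct buf/struct vnode and BUFHASH, and all locking is omitted.
 */
#include <stddef.h>
#include <stdint.h>

struct xvnode;

struct xbuf {
	struct xvnode *b_vp;		/* vnode the block belongs to */
	long b_lblkno;			/* logical block number */
	int b_flags;
	struct xbuf *b_hashnext;	/* hash-chain linkage */
};
#define	XB_INVAL	0x01
#define	XBUFHSZ		64

static struct xbuf *xbufhash[XBUFHSZ];

/* Hash a (vnode, logical block) pair to a chain head, like BUFHASH. */
static struct xbuf **
xbufhashchain(struct xvnode *vp, long blkno)
{
	return (&xbufhash[((uintptr_t)vp + (uintptr_t)blkno) % XBUFHSZ]);
}

/* Walk the chain; a hit must match vp and blkno and not be invalidated. */
static struct xbuf *
xgbincore(struct xvnode *vp, long blkno)
{
	struct xbuf *bp;

	for (bp = *xbufhashchain(vp, blkno); bp != NULL; bp = bp->b_hashnext)
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & XB_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*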
*/ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) return 0; obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > vp->v_mount->mnt_stat.f_iosize) tinc = vp->v_mount->mnt_stat.f_iosize; off = blkno * vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0) return 0; } return 1; } /* * now we set the dirty range for the buffer -- * for NFS -- if the file is mapped and pages have * been written to, let it know. We want the * entire range of the buffer to be marked dirty if * any of the pages have been written to for consistancy * with the b_validoff, b_validend set in the nfs write * code, and used by the nfs read code. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; vm_offset_t boffset, offset; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) && ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * scan forwards for the first page modified */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i << PAGE_SHIFT); if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = boffset; } /* * scan backwards for the last page modified */ for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i + 1); offset = boffset + bp->b_pages[0]->pindex; if (offset >= object->size) boffset = object->size - bp->b_pages[0]->pindex; if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) bp->b_dirtyend = (boffset << PAGE_SHIFT); } } /* * Get a block given a specified block and offset into a file/device. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; s = splbio(); loop: if ((bp = gbincore(vp, blkno))) { if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; if (!tsleep(bp, (PRIBIO + 1) | slpflag, "getblk", slptimeo)) goto loop; splx(s); return (struct buf *) NULL; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistancies (note that they shouldn't happen * but do when filesystems don't handle the size changes correctly.) * We are conservative on metadata and don't just extend the buffer * but write and re-constitute it. */ if (bp->b_bcount != size) { if (bp->b_flags & B_VMIO) { allocbuf(bp, size); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); goto loop; } } - /* - * make sure that all pages in the buffer are valid, if they - * aren't, clear the cache flag. - * ASSUMPTION: - * if the buffer is greater than 1 page in size, it is assumed - * that the buffer address starts on a page boundary... 
- */ - if (bp->b_flags & B_VMIO) { - int szleft, i; - szleft = size; - for (i=0;i<bp->b_npages;i++) { - if (szleft > PAGE_SIZE) { - if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) != - VM_PAGE_BITS_ALL) { - bp->b_flags &= ~(B_CACHE|B_DONE); - break; - } - szleft -= PAGE_SIZE; - } else { - if (!vm_page_is_valid(bp->b_pages[i], - (((vm_offset_t) bp->b_data) & PAGE_MASK), - szleft)) { - bp->b_flags &= ~(B_CACHE|B_DONE); - break; - } - szleft = 0; - } - } - } if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); return (bp); } else { vm_object_t obj; int doingvmio; if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { doingvmio = 1; } else { doingvmio = 0; } if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * Normally the vnode is locked so this isn't a problem. * VBLK type I/O requests, however, don't lock the vnode. */ if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); if (doingvmio) { bp->b_flags |= (B_VMIO | B_CACHE); #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } splx(s); allocbuf(bp, size); return (bp); } } /* * Get an empty, disassociated buffer of given size. */ struct buf * geteblk(int size) { struct buf *bp; while ((bp = getnewbuf(0, 0, 0)) == 0); allocbuf(bp, size); bp->b_flags |= B_INVAL; return (bp); } + /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). 
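 */

/*
 * Editor's note: a minimal userland sketch of allocbuf()'s size rounding,
 * added for illustration only.  The X* constants are hypothetical stand-ins
 * for DEV_BSIZE, PAGE_SIZE and PAGE_SHIFT.
 */
#include <stdio.h>

#define	XDEV_BSIZE	512
#define	XPAGE_SIZE	4096
#define	XPAGE_SHIFT	12

/*
 * Round a request up to a DEV_BSIZE multiple, then to whole pages,
 * mirroring the newbsize/desiredpages arithmetic in allocbuf() below.
 */
static void
xbufsizes(int size, int *newbsize, int *desiredpages)
{
	*newbsize = (size + XDEV_BSIZE - 1) & ~(XDEV_BSIZE - 1);
	*desiredpages =
	    ((*newbsize + XPAGE_SIZE - 1) & ~(XPAGE_SIZE - 1)) >> XPAGE_SHIFT;
}

int
main(void)
{
	int nb, dp;

	xbufsizes(6000, &nb, &dp);
	printf("newbsize=%d desiredpages=%d\n", nb, dp);	/* 6144, 2 */
	return (0);
}

/*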
*/ int allocbuf(struct buf * bp, int size) { int s; int newbsize, mbsize; int i; if (!(bp->b_flags & B_BUSY)) panic("allocbuf: buffer not busy"); if ((bp->b_flags & B_VMIO) == 0) { /* * Just get anonymous memory from the kernel */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); newbsize = round_page(size); if (newbsize < bp->b_bufsize) { vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (round_page(newbsize) >> PAGE_SHIFT); if (newbsize < bp->b_bufsize) { if (desiredpages < bp->b_npages) { - pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + - (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ m = bp->b_pages[i]; s = splhigh(); while ((m->flags & PG_BUSY) || (m->busy != 0)) { m->flags |= PG_WANTED; tsleep(m, PVM, "biodep", 0); } splx(s); - if (m->bmapped == 0) { - printf("allocbuf: bmapped is zero for page %d\n", i); - panic("allocbuf: error"); - } - --m->bmapped; - if (m->bmapped == 0) { - vm_page_protect(m, VM_PROT_NONE); - vm_page_free(m); - } bp->b_pages[i] = NULL; + vm_page_unwire(m); } + pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (newbsize > bp->b_bufsize) { vm_object_t obj; vm_offset_t tinc, toff; vm_ooffset_t off; vm_pindex_t objoff; int pageindex, curbpnpages; struct vnode *vp; int bsize; vp = bp->b_vp; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else bsize = vp->v_mount->mnt_stat.f_iosize; if (bp->b_npages < desiredpages) { obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > bsize) tinc = bsize; off = (vm_ooffset_t) bp->b_lblkno * bsize; doretry: curbpnpages = bp->b_npages; bp->b_flags |= B_CACHE; for (toff = 0; toff < newbsize; toff += tinc) { int bytesinpage; pageindex = toff >> PAGE_SHIFT; objoff = OFF_TO_IDX(off + toff); if (pageindex < curbpnpages) { m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG if (m->pindex != objoff) panic("allocbuf: page changed offset??!!!?"); +#endif bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; - if (!vm_page_is_valid(m, + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)), bytesinpage)) { bp->b_flags &= ~B_CACHE; } - if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; - } continue; } m = vm_page_lookup(obj, objoff); if (!m) { m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); if (!m) { - int j; - - for (j = bp->b_npages; j < pageindex; j++) { - PAGE_WAKEUP(bp->b_pages[j]); - } VM_WAIT; goto doretry; } - vm_page_activate(m); - m->act_count = 0; - m->valid = 0; + /* + * Normally it is unwise to clear PG_BUSY without + * PAGE_WAKEUP -- but it is okay here, as there is + * no chance for blocking between here and vm_page_alloc + */ + m->flags &= ~PG_BUSY; + vm_page_wire(m); bp->b_flags &= ~B_CACHE; } else if (m->flags & PG_BUSY) { - int j; - for (j = bp->b_npages; j < pageindex; j++) { - PAGE_WAKEUP(bp->b_pages[j]); - } - - s = splbio(); + s = splhigh(); m->flags |= PG_WANTED; tsleep(m, PVM, "pgtblk", 0); splx(s); goto doretry; } else { if ((curproc != pageproc) && - (m->flags & 
PG_CACHE) && - (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { + (m->queue == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; - if (!vm_page_is_valid(m, + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)), bytesinpage)) { bp->b_flags &= ~B_CACHE; } - if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; - } - m->flags |= PG_BUSY; + vm_page_wire(m); } bp->b_pages[pageindex] = m; curbpnpages = pageindex + 1; } - for (i = bp->b_npages; i < curbpnpages; i++) { - m = bp->b_pages[i]; - m->bmapped++; - PAGE_WAKEUP(m); - } - bp->b_npages = curbpnpages; +/* bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; - pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages); - bp->b_data += off & (PAGE_SIZE - 1); +*/ + bp->b_data = (caddr_t) trunc_page(bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & (PAGE_SIZE - 1); } } } bufspace += (newbsize - bp->b_bufsize); bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; } /* * Wait for buffer I/O completion, returning error status. */ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PRIBIO, "biowait", 0); splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. 
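 */

/*
 * Editor's note: an illustrative reduction of biowait()'s error mapping,
 * not part of this change.  The "x" names are hypothetical, and the
 * tsleep() loop that waits for B_DONE is elided.
 */
#include <errno.h>

#define	XB_ERROR	0x0001
#define	XB_EINTR	0x0002

struct xbiobuf {
	int b_flags;
	int b_error;
};

/*
 * Once the I/O has completed: an interrupted transfer yields EINTR, an
 * error yields b_error (or EIO if the driver left it unset), else success.
 */
static int
xbiowait_result(struct xbiobuf *bp)
{
	if (bp->b_flags & XB_EINTR) {
		bp->b_flags &= ~XB_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & XB_ERROR)
		return (bp->b_error ? bp->b_error : EIO);
	return (0);
}

/*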
*/ void biodone(register struct buf * bp) { int s; s = splbio(); if (!(bp->b_flags & B_BUSY)) panic("biodone: buffer not busy"); if (bp->b_flags & B_DONE) { splx(s); printf("biodone: buffer already done\n"); return; } bp->b_flags |= B_DONE; if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } #ifdef BOUNCE_BUFFERS if (bp->b_flags & B_BOUNCE) vm_bounce_free(bp); #endif /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; if (vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; obj = vp->v_object; if (!obj) { panic("biodone: no object"); } #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif iosize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif --obj->paging_in_progress; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly, so we only need to do this * here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vm_page_set_validclean(m, (vm_offset_t) (foff & (PAGE_SIZE-1)), resid); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (vp->v_type != VBLK) printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); - printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n", - m->valid, m->dirty, m->bmapped); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } --m->busy; if ((m->busy == 0) && (m->flags & PG_WANTED)) { m->flags &= ~PG_WANTED; wakeup(m); } --obj->paging_in_progress; foff += resid; iosize -= resid; } if (obj && obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup(obj); } } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - so * no need to do a wakeup here in the async case. 
*/ if (bp->b_flags & B_ASYNC) { - brelse(bp); + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); } else { wakeup(bp); } splx(s); } int count_lock_queue() { int count; struct buf *bp; count = 0; for (bp = bufqueues[QUEUE_LOCKED].tqh_first; bp != NULL; bp = bp->b_freelist.tqe_next) count++; return (count); } int vfs_update_interval = 30; static void vfs_update() { (void) spl0(); /* XXX redundant? wrong place? */ while (1) { tsleep(&vfs_update_wakeup, PUSER, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL, NULL); } } static int sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error) wakeup(&vfs_update_wakeup); return error; } SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; vm_ooffset_t foff; foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i); if (!m) { panic("vfs_unbusy_pages: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } --obj->paging_in_progress; --m->busy; if ((m->busy == 0) && (m->flags & PG_WANTED)) { m->flags &= ~PG_WANTED; wakeup(m); } } if (obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup(obj); } } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i; if (bp->b_flags & B_VMIO) { vm_object_t obj = bp->b_vp->v_object; vm_ooffset_t foff; int iocount = bp->b_bufsize; if (bp->b_vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; vfs_setdirty(bp); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iocount) resid = iocount; if ((bp->b_flags & B_CLUSTER) == 0) { obj->paging_in_progress++; m->busy++; } if (clear_modify) { vm_page_protect(m, VM_PROT_READ); vm_page_set_validclean(m, (vm_offset_t) (foff & (PAGE_SIZE-1)), resid); } else if (bp->b_bcount >= PAGE_SIZE) { if (m->valid && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } } foff += resid; iocount -= resid; } } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. 
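 */

/*
 * Editor's note: a small standalone demonstration of the per-page "resid"
 * walk used by vfs_busy_pages() above and vfs_clean_pages() below: each
 * iteration covers from the current file offset to the end of that page,
 * clamped to the bytes remaining.  Illustration only; the values are
 * hypothetical.
 */
#include <stdio.h>

#define	XPAGE_SIZE	4096LL

int
main(void)
{
	long long foff = 6144;		/* buffer starts mid-page */
	long long iocount = 8192;	/* bytes covered by the buffer */

	while (iocount > 0) {
		long long pindex = foff / XPAGE_SIZE;
		long long resid = (pindex + 1) * XPAGE_SIZE - foff;

		if (resid > iocount)
			resid = iocount;
		printf("page %lld: offset %lld, %lld bytes\n",
		    pindex, foff % XPAGE_SIZE, resid);
		foff += resid;
		iocount -= resid;
	}
	return (0);
}

/*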
*/ void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; int iocount = bp->b_bufsize; if (bp->b_vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iocount) resid = iocount; if (resid > 0) { vm_page_set_validclean(m, ((vm_offset_t) foff & (PAGE_SIZE-1)), resid); } foff += resid; iocount -= resid; } } } void vfs_bio_clrbuf(struct buf *bp) { int i; + int remapbuffer = 0; if( bp->b_flags & B_VMIO) { if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { int mask; mask = 0; for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE) mask |= (1 << (i/DEV_BSIZE)); if( bp->b_pages[0]->valid != mask) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid = mask; bp->b_resid = 0; return; } for(i=0;i<bp->b_npages;i++) { if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) continue; if( bp->b_pages[i]->valid == 0) { - if ((bp->b_pages[i]->flags & PG_ZERO) == 0) + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); + } } else { int j; for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { if( (bp->b_pages[i]->valid & (1<<j)) == 0) bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); } } bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; } bp->b_resid = 0; } else { clrbuf(bp); } + if (remapbuffer) + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; + int index; to = round_page(to); + from = round_page(from); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; - for (pg = round_page(from); pg < to; pg += PAGE_SIZE) { + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { VM_WAIT; goto tryagain; } vm_page_wire(p); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); - bp->b_pages[(pg - trunc_page(bp->b_data)) >> PAGE_SHIFT] = p; + bp->b_pages[index] = p; PAGE_WAKEUP(p); - bp->b_npages++; } + bp->b_npages = to >> PAGE_SHIFT; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; from = round_page(from); to = round_page(to); index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; - bp->b_pages[index] = 0; - pmap_kremove(pg); - vm_page_free(p); - --bp->b_npages; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_unwire(p); + vm_page_free(p); + } } + bp->b_npages = from >> PAGE_SHIFT; } Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 13489) +++ head/sys/kern/vfs_cache.c (revision 13490) @@ -1,319 +1,325 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1995 * Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 - * $Id: vfs_cache.c,v 1.18 1995/12/14 09:52:47 phk Exp $ + * $Id: vfs_cache.c,v 1.19 1995/12/22 15:56:35 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include +#define MAXVNODEUSE 32 + /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (that we know a name to >not< exist) * we point out entry at our own "nchENOENT", to avoid too much special * casing in the inner loops of lookup. * * For simplicity (and economy of storage), names longer than * a maximum length of NCHNAMLEN are not cached; they occur * infrequently in any case, and are almost never of interest. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. 
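 */

/*
 * Editor's note: a compilable sketch of the two linkages described above --
 * a hash table for lookup and an LRU tail queue for reuse -- plus the
 * sentinel vnode used for negative entries.  Illustration only; every
 * "x"-prefixed name is a hypothetical stand-in.
 */
#include <sys/queue.h>

#define	XNCHNAMLEN	31
#define	XNCHASH		128

struct xcvnode { unsigned long v_id; };

struct xnamecache {
	LIST_ENTRY(xnamecache) nc_hash;		/* hash chain, for lookup */
	TAILQ_ENTRY(xnamecache) nc_lru;		/* LRU chain, for reuse */
	struct xcvnode *nc_dvp;			/* directory searched */
	struct xcvnode *nc_vp;			/* result (or sentinel) */
	int nc_nlen;
	char nc_name[XNCHNAMLEN + 1];
};

LIST_HEAD(xnchashhead, xnamecache);
static struct xnchashhead xnchashtbl[XNCHASH];
static TAILQ_HEAD(, xnamecache) xnclruhead =
    TAILQ_HEAD_INITIALIZER(xnclruhead);

/* Negative entries point here instead of at a real vnode. */
static struct xcvnode xnchENOENT;

/* Index by directory id plus the component-name hash, as above. */
static struct xnchashhead *
xncchain(struct xcvnode *dvp, unsigned long namehash)
{
	return (&xnchashtbl[(dvp->v_id + namehash) % XNCHASH]);
}

/*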
*/ static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ static u_long nchash; /* size of hash table */ struct nchstats nchstats; /* cache effectiveness statistics */ static struct vnode nchENOENT; /* our own "novnode" */ static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); static u_long numcache; u_long numvnodes; #ifdef NCH_STATISTICS u_long nchnbr; #define NCHNBR(ncp) (ncp)->nc_nbr = ++nchnbr; #define NCHHIT(ncp) (ncp)->nc_hits++ #else #define NCHNBR(ncp) #define NCHHIT(ncp) #endif #define PURGE(ncp) { \ LIST_REMOVE(ncp, nc_hash); \ ncp->nc_hash.le_prev = 0; \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); } #define TOUCH(ncp) { \ if (ncp->nc_lru.tqe_next == 0) { } else { \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ NCHNBR(ncp); } } /* * Lookup an entry in the cache * * We don't do this if the segment name is long, simply so the cache * can avoid holding long names (which would either waste space, or * add greatly to the complexity). * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. * If the lookup succeeds, the vnode is returned in *vpp, and a status * of -1 is returned. * If the lookup determines that the name does not exist (negative cacheing), * a status of ENOENT is returned. * If the lookup fails, a status of zero is returned. */ int cache_lookup(dvp, vpp, cnp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct namecache *ncp,*nnp; register struct nchashhead *ncpp; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } if (cnp->cn_namelen > NCHNAMLEN) { nchstats.ncs_long++; cnp->cn_flags &= ~MAKEENTRY; return (0); } ncpp = &nchashtbl[(dvp->v_id + cnp->cn_hash) % nchash]; for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { nnp = ncp->nc_hash.le_next; /* If one of the vp's went stale, don't bother anymore. */ if ((ncp->nc_dvpid != ncp->nc_dvp->v_id) || (ncp->nc_vpid != ncp->nc_vp->v_id)) { nchstats.ncs_falsehits++; PURGE(ncp); continue; } /* Now that we know the vp's to be valid, is it ours ? */ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) goto found; /* Fanatism considered bad. */ } nchstats.ncs_miss++; return (0); found: NCHHIT(ncp); /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { nchstats.ncs_badhits++; PURGE(ncp); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp != &nchENOENT) { nchstats.ncs_goodhits++; TOUCH(ncp); *vpp = ncp->nc_vp; + if ((*vpp)->v_usage < MAXVNODEUSE) + (*vpp)->v_usage++; return (-1); } /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { nchstats.ncs_badhits++; PURGE(ncp); return (0); } /* The name does not exists */ nchstats.ncs_neghits++; TOUCH(ncp); return (ENOENT); } /* * Add an entry to the cache. 
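 */

/*
 * Editor's note: a hypothetical caller showing how cache_lookup()'s three
 * results are consumed: -1 is a positive hit (*vpp is set), ENOENT is a
 * cached "name does not exist", and 0 means the directory must really be
 * searched.  Illustration only; the "x" functions are stand-ins and take
 * a plain name rather than a struct componentname.
 */
#include <errno.h>

struct xlvnode;
int xcache_lookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp);
int xreal_lookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp);

static int
xdirlookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp)
{
	switch (xcache_lookup(dvp, name, vpp)) {
	case -1:
		return (0);		/* positive hit: *vpp already set */
	case ENOENT:
		return (ENOENT);	/* negative entry: fail fast */
	default:
		return (xreal_lookup(dvp, name, vpp));	/* miss: scan */
	}
}

/*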
*/ void cache_enter(dvp, vp, cnp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; { register struct namecache *ncp; register struct nchashhead *ncpp; if (!doingcache) return; if (cnp->cn_namelen > NCHNAMLEN) { printf("cache_enter: name too long"); return; } if (numcache < numvnodes) { /* Add one more entry */ ncp = (struct namecache *) malloc((u_long)sizeof *ncp, M_CACHE, M_WAITOK); bzero((char *)ncp, sizeof *ncp); numcache++; } else if (ncp = nclruhead.tqh_first) { /* reuse an old entry */ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); if (ncp->nc_hash.le_prev != 0) { LIST_REMOVE(ncp, nc_hash); ncp->nc_hash.le_prev = 0; } } else { /* give up */ return; } /* If vp is NULL this is a "negative" cache entry */ if (!vp) vp = &nchENOENT; /* fill in cache info */ ncp->nc_vp = vp; + if (vp->v_usage < MAXVNODEUSE) + ++vp->v_usage; ncp->nc_vpid = vp->v_id; ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; ncp->nc_nlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); ncpp = &nchashtbl[(dvp->v_id + cnp->cn_hash) % nchash]; LIST_INSERT_HEAD(ncpp, ncp, nc_hash); } /* * Name cache initialization, from vfs_init() when we are booting */ void nchinit() { TAILQ_INIT(&nclruhead); nchashtbl = phashinit(desiredvnodes, M_CACHE, &nchash); cache_purge(&nchENOENT); /* Initialize v_id */ } /* * Invalidate all entries to a particular vnode. * * We actually just increment the v_id, that will do it. The stale entries * will be purged by lookup as they get found. * If the v_id wraps around, we need to ditch the entire cache, to avoid * confusion. * No valid vnode will ever have (v_id == 0). */ void cache_purge(vp) struct vnode *vp; { struct nchashhead *ncpp; static u_long nextvnodeid; vp->v_id = ++nextvnodeid; if (nextvnodeid != 0) return; for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { while(ncpp->lh_first) PURGE(ncpp->lh_first); } nchENOENT.v_id = ++nextvnodeid; vp->v_id = ++nextvnodeid; } /* * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid * entries at the same time. * * If we purge anything, we scan the hash-bucket again. There is only * a handful of entries, so it cheap and simple. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp; /* Scan hash tables for applicable entries */ for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { ncp = ncpp->lh_first; while(ncp) { if (ncp->nc_dvpid != ncp->nc_dvp->v_id || ncp->nc_vpid != ncp->nc_vp->v_id || ncp->nc_dvp->v_mount == mp) { PURGE(ncp); ncp = ncpp->lh_first; } else { ncp = ncp->nc_hash.le_next; } } } } Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 13489) +++ head/sys/kern/vfs_cluster.c (revision 13490) @@ -1,715 +1,710 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.30 1995/12/11 04:56:07 dyson Exp $ + * $Id: vfs_cluster.c,v 1.31 1995/12/22 16:06:46 bde Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef notyet_block_reallocation_enabled #ifdef DEBUG #include #include static int doreallocblks = 0; SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #else #define doreallocblks 0 #endif #endif /* notyet_block_reallocation_enabled */ #ifdef notyet_block_reallocation_enabled static struct cluster_save * cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); #endif static struct buf * cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run)); static int totreads; static int totreadblocks; extern vm_page_t bogus_page; #ifdef DIAGNOSTIC /* * Set to 1 if reads of block zero should cause readahead to be done. * Set to 0 treats a read of block zero as a non-sequential read. * * Setting to one assumes that most reads of block zero of files are due to * sequential passes over the files (e.g. cat, sum) where additional blocks * will soon be needed. Setting to zero assumes that the majority are * surgical strikes to get particular info (e.g. size, file) where readahead * blocks will not be used and, in fact, push out other potentially useful * blocks from the cache. The former seems intuitive, but some quick tests * showed that the latter performed better from a system-wide point of view. */ int doclusterraz = 0; #define ISSEQREAD(vp, blk) \ (((blk) != 0 || doclusterraz) && \ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #else #define ISSEQREAD(vp, blk) \ (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #endif /* * allow for three entire read-aheads... The system will * adjust downwards rapidly if needed... */ #define RA_MULTIPLE_FAST 2 #define RA_MULTIPLE_SLOW 3 #define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */ /* * This replaces bread. 
If this is a bread at the beginning of a file and * lastr is 0, we assume this is the first read and we'll read up to two * blocks if they are sequential. After that, we'll do regular read ahead * in clustered chunks. * bp is the block requested. * rbp is the read-ahead block. * If either is NULL, then you don't have to do the I/O. */ int cluster_read(vp, filesize, lblkno, size, cred, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; struct buf **bpp; { struct buf *bp, *rbp; daddr_t blkno, rablkno, origlblkno; int error, num_ra, alreadyincore; int i; int seq; error = 0; /* * get the requested block */ origlblkno = lblkno; *bpp = bp = getblk(vp, lblkno, size, 0, 0); seq = ISSEQREAD(vp, lblkno); /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seq) { vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; vp->v_ralen >>= RA_SHIFTDOWN; return 0; } else if( vp->v_maxra > lblkno) { - if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= - (lblkno + vp->v_ralen)) { + if ( vp->v_maxra > lblkno + (vp->v_ralen / RA_MULTIPLE_SLOW) ) { if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size)) ++vp->v_ralen; return 0; } lblkno = vp->v_maxra; } else { lblkno += 1; } bp = NULL; } else { /* * if it isn't in the cache, then get a chunk from disk if * sequential, otherwise just get the block. */ bp->b_flags |= B_READ; lblkno += 1; curproc->p_stats->p_ru.ru_inblock++; /* XXX */ vp->v_ralen = 0; } /* * assume no read-ahead */ alreadyincore = 1; rablkno = lblkno; /* * if we have been doing sequential I/O, then do some read-ahead */ if (seq) { /* * bump ralen a bit... */ if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size)) ++vp->v_ralen; /* * this code makes sure that the stuff that we have read-ahead * is still in the cache. If it isn't, we have been reading * ahead too much, and we need to back-off, otherwise we might * try to read more. */ - for (i = 0; i < vp->v_ralen; i++) { + for (i = 0; i < vp->v_maxra - lblkno; i++) { rablkno = lblkno + i; - alreadyincore = (int) gbincore(vp, rablkno); + alreadyincore = (int) incore(vp, rablkno); if (!alreadyincore) { - if (rablkno < vp->v_maxra) { - vp->v_maxra = rablkno; - vp->v_ralen >>= RA_SHIFTDOWN; - alreadyincore = 1; - } - break; - } else if (vp->v_maxra < rablkno) { - vp->v_maxra = rablkno + 1; + vp->v_maxra = rablkno; + vp->v_ralen >>= RA_SHIFTDOWN; + alreadyincore = 1; } } } /* * we now build the read-ahead buffer if it is desirable. 
*/ rbp = NULL; if (!alreadyincore && ((u_quad_t)(rablkno + 1) * size) <= filesize && !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) && blkno != -1) { if (num_ra > vp->v_ralen) num_ra = vp->v_ralen; if (num_ra) { rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size, num_ra + 1); } else { rbp = getblk(vp, rablkno, size, 0, 0); rbp->b_flags |= B_READ | B_ASYNC; rbp->b_blkno = blkno; } } /* * handle the synchronous read */ if (bp) { if (bp->b_flags & (B_DONE | B_DELWRI)) panic("cluster_read: DONE bp"); else { vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; totreads++; totreadblocks += bp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } /* * and if we have read-aheads, do them too */ if (rbp) { vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; - if (error || (rbp->b_flags & B_CACHE)) { + if (error) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); } else { if ((rbp->b_flags & B_CLUSTER) == 0) vfs_busy_pages(rbp, 0); (void) VOP_STRATEGY(rbp); totreads++; totreadblocks += rbp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } if (bp && ((bp->b_flags & B_ASYNC) == 0)) return (biowait(bp)); return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(vp, filesize, lbn, blkno, size, run) struct vnode *vp; u_quad_t filesize; daddr_t lbn; daddr_t blkno; long size; int run; { struct buf *bp, *tbp; daddr_t bn; int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", size, vp->v_mount->mnt_stat.f_iosize); #endif /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } tbp = getblk(vp, lbn, size, 0, 0); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_blkno = blkno; tbp->b_flags |= B_ASYNC | B_READ; if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(); if (bp == 0) return tbp; (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i != 0) { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > MAXPHYS) break; - if (gbincore(vp, lbn + i)) + if (incore(vp, lbn + i)) break; tbp = getblk(vp, lbn + i, size, 0, 0); if ((tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { - brelse(tbp); + bqrelse(tbp); break; } for (j=0;j<tbp->b_npages;j++) { if (tbp->b_pages[j]->valid) { break; } } if (j != tbp->b_npages) { /* * force buffer to be re-constituted later */ tbp->b_flags |= B_RELBUF; brelse(tbp); break; } tbp->b_flags |= B_READ | B_ASYNC; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { brelse(tbp); break; } } TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; ++m->busy; ++m->object->paging_in_progress; if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) { m = bogus_page; } if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) 
{ bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } bp->b_bcount += tbp->b_bcount; bp->b_bufsize += tbp->b_bufsize; } pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; /* * Must propogate errors to all the components. */ if (bp->b_flags & B_ERROR) error = bp->b_error; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = bp->b_cluster.cluster_head.tqh_first; tbp; tbp = nbp) { nbp = tbp->b_cluster.cluster_entry.tqe_next; if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); } relpbuf(bp); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(bp, filesize) struct buf *bp; u_quad_t filesize; { struct vnode *vp; daddr_t lbn; int maxclen, cursize; int lblocksize; int async; vp = bp->b_vp; async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)); lblocksize = vp->v_mount->mnt_stat.f_iosize; lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = MAXPHYS / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. */ cursize = vp->v_lastw - vp->v_cstart + 1; #ifndef notyet_block_reallocation_enabled if (((u_quad_t)(lbn + 1) * lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async) cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } #else if (!doreallocblks || (lbn + 1) * lblocksize != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async) cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } #endif /* notyet_block_reallocation_enabled */ } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. 
*/ if (((u_quad_t) (lbn + 1) * lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; - if (!async) - bawrite(bp); - else - bdwrite(bp); + bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out. */ bdwrite(bp); - cluster_wbuild(vp, lblocksize, vp->v_cstart, - vp->v_clen + 1); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; long size; daddr_t start_lbn; int len; { struct buf *bp, *tbp; int i, j, s; int totalwritten = 0; int dbsize = btodb(size); while (len > 0) { s = splbio(); if ( ((tbp = gbincore(vp, start_lbn)) == NULL) || ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { ++start_lbn; --len; splx(s); continue; } bremfree(tbp); tbp->b_flags |= B_BUSY; tbp->b_flags &= ~B_DONE; splx(s); /* * Extra memory in the buffer, punt on this buffer. XXX we could * handle this in most cases, but we would have to push the extra * memory down to after our max possible cluster size and then * potentially pull it back up if the cluster was terminated * prematurely--too much hassle. 
*/ if (((tbp->b_flags & B_CLUSTEROK) != B_CLUSTEROK) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || len == 1) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } bp = trypbuf(); if (bp == NULL) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & B_VMIO); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { s = splbio(); if ((tbp = gbincore(vp, start_lbn)) == NULL) { splx(s); break; } if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & B_VMIO))) { splx(s); break; } if ((tbp->b_bcount != size) || ((bp->b_blkno + dbsize * i) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) { splx(s); break; } bremfree(tbp); tbp->b_flags |= B_BUSY; tbp->b_flags &= ~B_DONE; splx(s); } - for (j = 0; j < tbp->b_npages; j += 1) { - vm_page_t m; - m = tbp->b_pages[j]; - ++m->busy; - ++m->object->paging_in_progress; - if ((bp->b_npages == 0) || - (bp->b_pages[bp->b_npages - 1] != m)) { - bp->b_pages[bp->b_npages] = m; - bp->b_npages++; + if (tbp->b_flags & B_VMIO) { + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } } } bp->b_bcount += size; bp->b_bufsize += size; tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); tbp->b_flags |= B_ASYNC; s = splbio(); reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; splx(s); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *) bp->b_pages, bp->b_npages); totalwritten += bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } #ifdef notyet_block_reallocation_enabled /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); } #endif /* notyet_block_reallocation_enabled */ Index: head/sys/kern/vfs_export.c =================================================================== --- head/sys/kern/vfs_export.c (revision 13489) +++ head/sys/kern/vfs_export.c (revision 13490) @@ -1,1538 +1,1547 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 - * $Id: vfs_subr.c,v 1.50 1996/01/02 18:13:20 davidg Exp $ + * $Id: vfs_subr.c,v 1.51 1996/01/04 21:12:26 wollman Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void printlockedvnodes __P((void)); #endif extern void vclean __P((struct vnode *vp, int flags)); extern void vfs_unmountroot __P((struct mount *rootfs)); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * Insq/Remq for the vnode usage lists. */ #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) #define bufremvn(bp) { \ LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ u_long freevnodes = 0; struct mntlist mountlist; /* mounted filesystem list */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RD, &desiredvnodes, 0, ""); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + vm_object_cache_max; TAILQ_INIT(&vnode_free_list); CIRCLEQ_INIT(&mountlist); } /* * Lock a filesystem. * Used to prevent access to it while mounting and unmounting. 
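 */

/*
 * Editor's note: a minimal model of the flag-based sleep lock implemented
 * by vfs_lock()/vfs_unlock() below: one "locked" bit, plus a "waiter" bit
 * so the unlocker knows a wakeup is needed.  Illustration only;
 * xsleep()/xwakeup() are hypothetical stand-ins for tsleep()/wakeup().
 */
#define	XMNT_MLOCK	0x01
#define	XMNT_MWAIT	0x02

struct xlmount { int mnt_flag; };

void xsleep(void *chan);		/* blocks until xwakeup(chan) */
void xwakeup(void *chan);

static void
xvfs_lock(struct xlmount *mp)
{
	while (mp->mnt_flag & XMNT_MLOCK) {
		mp->mnt_flag |= XMNT_MWAIT;	/* record contention */
		xsleep(mp);
	}
	mp->mnt_flag |= XMNT_MLOCK;
}

static void
xvfs_unlock(struct xlmount *mp)
{
	mp->mnt_flag &= ~XMNT_MLOCK;
	if (mp->mnt_flag & XMNT_MWAIT) {
		mp->mnt_flag &= ~XMNT_MWAIT;
		xwakeup(mp);
	}
}

/*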
*/ int vfs_lock(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MLOCK) { mp->mnt_flag |= MNT_MWAIT; (void) tsleep((caddr_t) mp, PVFS, "vfslck", 0); } mp->mnt_flag |= MNT_MLOCK; return (0); } /* * Unlock a locked filesystem. * Panic if filesystem is not locked. */ void vfs_unlock(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MLOCK) == 0) panic("vfs_unlock: not locked"); mp->mnt_flag &= ~MNT_MLOCK; if (mp->mnt_flag & MNT_MWAIT) { mp->mnt_flag &= ~MNT_MWAIT; wakeup((caddr_t) mp); } } /* * Mark a mount point as busy. * Used to synchronize access and to delay unmounting. */ int vfs_busy(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MPBUSY) { mp->mnt_flag |= MNT_MPWANT; (void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0); } if (mp->mnt_flag & MNT_UNMOUNT) return (1); mp->mnt_flag |= MNT_MPBUSY; return (0); } /* * Free a busy filesystem. * Panic if filesystem is not busy. */ void vfs_unbusy(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vfs_unbusy: not busy"); mp->mnt_flag &= ~MNT_MPBUSY; if (mp->mnt_flag & MNT_MPWANT) { mp->mnt_flag &= ~MNT_MPWANT; wakeup((caddr_t) &mp->mnt_flag); } } void vfs_unmountroot(struct mount *rootfs) { struct mount *mp = rootfs; int error; if (vfs_busy(mp)) { printf("failed to unmount root\n"); return; } mp->mnt_flag |= MNT_UNMOUNT; if ((error = vfs_lock(mp))) { printf("lock of root filesystem failed (%d)\n", error); return; } vnode_pager_umount(mp); /* release cached vnodes */ cache_purgevfs(mp); /* remove cache entries for this file sys */ if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc))) printf("sync of root filesystem failed (%d)\n", error); if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) { printf("unmount of root filesystem failed ("); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } mp->mnt_flag &= ~MNT_UNMOUNT; vfs_unbusy(mp); } /* * Unmount all filesystems. Should only be called by halt(). */ void vfs_unmountall() { struct mount *mp, *nmp, *rootfs = NULL; int error; /* unmount all but rootfs */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; if (mp->mnt_flag & MNT_ROOTFS) { rootfs = mp; continue; } error = dounmount(mp, MNT_FORCE, initproc); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* and finally... */ if (rootfs) { vfs_unmountroot(rootfs); } else { printf("no root filesystem\n"); } } /* * Lookup a mount point by filesystem identifier. 
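vfs_lock()/vfs_unlock() and vfs_busy()/vfs_unbusy() above share one idiom: a "locked" bit the taker spins on, a "wanted" bit set before sleeping, and a wakeup issued only when someone declared interest. A rough userland analogue, substituting a pthread mutex and condition variable for tsleep()/wakeup() (the struct and function names here are invented for the sketch, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Models mnt_flag's MNT_MLOCK/MNT_MWAIT bits. */
struct flag_lock {
	int locked;
	int wanted;
	pthread_mutex_t mtx;
	pthread_cond_t  cv;	/* stands in for the tsleep()/wakeup() channel */
};

static void
flag_lock(struct flag_lock *f)
{
	pthread_mutex_lock(&f->mtx);
	while (f->locked) {
		f->wanted = 1;			/* MNT_MWAIT: ask to be woken */
		pthread_cond_wait(&f->cv, &f->mtx);
	}
	f->locked = 1;				/* MNT_MLOCK */
	pthread_mutex_unlock(&f->mtx);
}

static void
flag_unlock(struct flag_lock *f)
{
	pthread_mutex_lock(&f->mtx);
	f->locked = 0;
	if (f->wanted) {			/* wake only if someone asked */
		f->wanted = 0;
		pthread_cond_broadcast(&f->cv);
	}
	pthread_mutex_unlock(&f->mtx);
}

int main(void)
{
	struct flag_lock f = { 0, 0, PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER };

	flag_lock(&f);
	flag_unlock(&f);
	printf("lock/unlock handshake ok\n");
	return 0;
}

The same shape reappears later in this file as the VXLOCK/VXWANT interlock in vclean() and vgone().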
 */
struct mount *
getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
			return (mp);
	}
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
getnewfsid(mp, mtype)
	struct mount *mp;
	int mtype;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;

	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
	    vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
	    vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	register struct vnode *vp;

+retry:
	vp = vnode_free_list.tqh_first;
	/*
	 * we allocate a new vnode if
	 * 1. we don't have any free
	 *	Pretty obvious, we actually used to panic, but that
	 *	is a silly thing to do.
	 * 2. we haven't filled our pool yet
	 *	We don't want to trash the incore (VM-)vnodecache.
	 * 3. if less than 1/4th of our vnodes are free.
	 *	We don't want to trash the namei cache either.
	 */
	if (freevnodes < (numvnodes >> 2) ||
	    numvnodes < desiredvnodes ||
	    vp == NULL) {
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		numvnodes++;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		if (vp->v_usage > 0) {
+			--vp->v_usage;
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+			goto retry;
+		}
		freevnodes--;
-		if (vp->v_usecount)
-			panic("free vnode isn't");
-
		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgone(vp);
+		if (vp->v_usecount)
+			panic("free vnode isn't");
+
#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
+		vp->v_usage = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
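The retry:/v_usage change to getnewvnode() above is a second-chance policy: the vnode at the head of the free list is recycled only once its usage credit is exhausted; otherwise a credit is spent and the vnode is rotated to the tail. Because every failed probe decrements a credit, the retry loop must terminate. A self-contained sketch using the same TAILQ macros, with a toy struct node standing in for the vnode:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int usage;			/* models v_usage */
	TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(nlist, node);

/*
 * Second-chance recycling: the head keeps its place in line only by
 * spending usage credit; a node is recycled once its credit hits 0.
 */
static struct node *
recycle(struct nlist *freelist)
{
	struct node *n;

	for (;;) {
		n = TAILQ_FIRST(freelist);
		if (n == NULL)
			return NULL;	/* caller would malloc a fresh one */
		TAILQ_REMOVE(freelist, n, link);
		if (n->usage > 0) {
			--n->usage;	/* spend a credit ... */
			TAILQ_INSERT_TAIL(freelist, n, link);	/* ... go to the back */
			continue;	/* and retry from the new head */
		}
		return n;		/* no credit left: reuse this one */
	}
}

int main(void)
{
	struct nlist fl = TAILQ_HEAD_INITIALIZER(fl);
	struct node a = { 2 }, b = { 0 };

	TAILQ_INSERT_TAIL(&fl, &a, link);
	TAILQ_INSERT_TAIL(&fl, &b, link);
	/* a still has credit, so b (never re-referenced) is recycled first. */
	printf("recycled node with usage %d\n", recycle(&fl)->usage);
	return 0;
}

The hunk also moves the "free vnode isn't" panic to after vgone(), since the aging pass can now legitimately touch vnodes that vgone() has yet to settle.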
*/ if ((vp->v_mount = mp) == NULL) return; LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } for (;;) { if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; error = tsleep((caddr_t) bp, slpflag | (PRIBIO + 1), "vinvalbuf", slptimeo); splx(s); if (error) return (error); break; } bremfree(bp); bp->b_flags |= B_BUSY; splx(s); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } s = splbio(); while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, object->size, (flags & V_SAVE) ? TRUE : FALSE); } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; if (bp->b_vp) panic("bgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); splx(s); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Associate a p-buffer with a vnode. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { if (bp->b_vp) panic("pbgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
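vinvalbuf() above can block on a B_BUSY buffer, and after sleeping its saved iterator may be stale, so it breaks out of the inner loop and rescans from the list head. The same restart-on-sleep shape, reduced to a toy singly linked list (the busy flag here just simulates a single sleep):

#include <stdio.h>
#include <stdlib.h>

struct buf {
	int busy;			/* would make us sleep in the kernel */
	int valid;
	struct buf *next;
};

/*
 * Invalidate every buffer on the list.  After "sleeping" on a busy
 * buffer the list may have changed underneath us, so restart from the
 * head rather than trusting a saved next pointer -- the same shape as
 * vinvalbuf()'s outer for (;;) over blist.
 */
static void
invalidate_all(struct buf *head)
{
	struct buf *bp;

restart:
	for (bp = head; bp != NULL; bp = bp->next) {
		if (bp->busy) {
			bp->busy = 0;	/* pretend we slept and it was released */
			goto restart;	/* iterator is stale: rescan */
		}
		bp->valid = 0;
	}
}

int main(void)
{
	struct buf c = { 0, 1, NULL }, b = { 1, 1, &c }, a = { 0, 1, &b };

	invalidate_all(&a);
	printf("valid flags: %d %d %d\n", a.valid, b.valid, c.valid);	/* 0 0 0 */
	return 0;
}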
*/ void pbrelvp(bp) register struct buf *bp; { struct vnode *vp; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { register struct buflists *listheadp; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; tbp = newvp->v_dirtyblkhd.lh_first; if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { bufinsvn(bp, &newvp->v_dirtyblkhd); } else { - while (tbp->b_vnbufs.le_next && (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { tbp = tbp->b_vnbufs.le_next; } LIST_INSERT_AFTER(tbp, bp, b_vnbufs); } } else { listheadp = &newvp->v_cleanblkhd; bufinsvn(bp, listheadp); } } /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. * Also used for memory file system special devices. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) return (0); error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; dev_t nvp_rdev; struct mount *mp; { register struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vp->v_usecount == 0) { vgone(vp); goto loop; } if (vget(vp, 1)) goto loop; break; } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; nvp->v_specflags = 0; *vpp = nvp; if (vp != NULL) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } VOP_UNLOCK(vp); vclean(vp, 0); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). 
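The reassignbuf() hunk in this commit is only a line wrap, but the code it touches is worth a gloss: dirty buffers are kept sorted by b_lblkno so they can be pushed out in file order. The insertion logic, reduced to a plain singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct buf {
	long lblkno;
	struct buf *next;
};

/* Insert bp so the list stays sorted by ascending lblkno. */
static void
sorted_insert(struct buf **headp, struct buf *bp)
{
	struct buf *tbp = *headp;

	if (tbp == NULL || tbp->lblkno > bp->lblkno) {
		bp->next = tbp;		/* empty list, or new smallest: new head */
		*headp = bp;
		return;
	}
	/* walk until the next entry would be past us, then splice in */
	while (tbp->next != NULL && tbp->next->lblkno < bp->lblkno)
		tbp = tbp->next;
	bp->next = tbp->next;
	tbp->next = bp;
}

int main(void)
{
	struct buf a = { 10, NULL }, b = { 30, NULL }, c = { 20, NULL };
	struct buf *head = NULL, *p;

	sorted_insert(&head, &a);
	sorted_insert(&head, &b);
	sorted_insert(&head, &c);
	for (p = head; p != NULL; p = p->next)
		printf("%ld ", p->lblkno);	/* 10 20 30 */
	printf("\n");
	return 0;
}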
*/ int vget(vp, lockflag) register struct vnode *vp; int lockflag; { /* * If the vnode is in the process of being cleaned out for another * use, we wait for the cleaning to finish and then return failure. * Cleaning is determined either by checking that the VXLOCK flag is * set, or that the use count is zero with the back pointer set to * show that it has been removed from the free list by getnewvnode. * The VXLOCK flag may not have been set yet because vclean is blocked * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete. */ if ((vp->v_flag & VXLOCK) || (vp->v_usecount == 0 && vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vget", 0); return (1); } if (vp->v_usecount == 0) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } vp->v_usecount++; if (lockflag) VOP_LOCK(vp); return (0); } /* * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { if (vp->v_usecount <= 0) panic("vref used where vget required"); vp->v_usecount++; } /* * vput(), just unlock and vrele() */ void vput(vp) register struct vnode *vp; { VOP_UNLOCK(vp); vrele(vp); } /* * Vnode release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) register struct vnode *vp; { #ifdef DIAGNOSTIC if (vp == NULL) panic("vrele: null vp"); #endif vp->v_usecount--; if (vp->v_usecount > 0) return; if (vp->v_usecount < 0 /* || vp->v_writecount < 0 */ ) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif panic("vrele: negative reference cnt"); } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; + vp->v_usage = 0; } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; VOP_INACTIVE(vp); } #ifdef DIAGNOSTIC /* * Page or buffer structure gets a reference. */ void vhold(vp) register struct vnode *vp; { vp->v_holdcnt++; } /* * Page or buffer structure frees a reference. */ void holdrele(vp) register struct vnode *vp; { if (vp->v_holdcnt <= 0) panic("holdrele: holdcnt"); vp->v_holdcnt--; } #endif /* DIAGNOSTIC */ /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { register struct vnode *vp, *nvp; int busy = 0; if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vflush: not busy"); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) continue; /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) continue; /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. 
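The one-line change in vrele() above pairs with the getnewvnode() aging: a vnode released with VAGE set goes to the head of the free list and now also loses its usage credit, making it the preferred recycling victim; everything else goes to the tail and is recycled last. A sketch of that placement policy (struct vn and release() are illustrative names):

#include <sys/queue.h>
#include <stdio.h>

struct vn {
	int aged;			/* models the VAGE flag */
	int usage;			/* models v_usage */
	const char *name;
	TAILQ_ENTRY(vn) link;
};

TAILQ_HEAD(vlist, vn);

/*
 * Release to the free list: aged vnodes go to the head (recycled
 * first) with their usage credit zeroed, so the second-chance logic
 * cannot save them; everything else goes to the tail.
 */
static void
release(struct vlist *fl, struct vn *vp)
{
	if (vp->aged) {
		TAILQ_INSERT_HEAD(fl, vp, link);
		vp->aged = 0;
		vp->usage = 0;		/* no second chance for aged vnodes */
	} else {
		TAILQ_INSERT_TAIL(fl, vp, link);
	}
}

int main(void)
{
	struct vlist fl = TAILQ_HEAD_INITIALIZER(fl);
	struct vn a = { 0, 1, "a" }, b = { 1, 3, "b" };

	release(&fl, &a);
	release(&fl, &b);		/* aged: jumps the queue */
	printf("next victim: %s (usage %d)\n",
	    TAILQ_FIRST(&fl)->name, TAILQ_FIRST(&fl)->usage);	/* b, 0 */
	return 0;
}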
For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { if (vp->v_type != VBLK && vp->v_type != VCHR) { vgone(vp); } else { vclean(vp, 0); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif busy++; } if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ void vclean(struct vnode *vp, int flags) { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) VREF(vp); /* * Even if the count is zero, the VOP_INACTIVE routine may still have * the object locked while it cleans it out. The VOP_LOCK ensures that * the VOP_INACTIVE routine is done with its work. For active vnodes, * it ensures that no other activity can occur while the underlying * object is being cleaned out. */ VOP_LOCK(vp); /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Clean out any buffers associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); /* * Any other processes trying to obtain this lock must first wait for * VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp); /* * If purging an active vnode, it must be closed and deactivated * before being reclaimed. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); VOP_INACTIVE(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vclean: cannot reclaim"); if (active) vrele(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vgoneall(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, wait until * it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgall", 0); return; } /* * Ensure that vp will not be vgone'd while we are eliminating * its aliases. */ vp->v_flag |= VXLOCK; while (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type || vp == vq) continue; vgone(vq); break; } } /* * Remove the lock so that vgone below will really eliminate * the vnode after which time vgone will awaken any sleepers. */ vp->v_flag &= ~VXLOCK; } vgone(vp); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { register struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, wait until it is * done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) { LIST_REMOVE(vp, v_mntvnodes); vp->v_mount = NULL; } /* * If special device, remove it from special device alias list. 
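vclean()'s first move is the point of the comment above: if the vnode is active, take an extra reference before tearing it down so the count cannot reach zero mid-cleanup, and drop it again only after reclaim. The bare shape of that pin/teardown/unpin sequence, on a toy refcounted object:

#include <stdio.h>
#include <assert.h>

struct obj {
	int refs;
	int reclaimed;
};

static void obj_ref(struct obj *o)  { o->refs++; }
static void obj_rele(struct obj *o) { assert(o->refs > 0); o->refs--; }

/*
 * Teardown that is safe against the count reaching zero underneath
 * us: if the object is active, pin it with an extra reference for the
 * duration, as vclean() does with VREF()/vrele().
 */
static void
teardown(struct obj *o)
{
	int active = o->refs;

	if (active)
		obj_ref(o);		/* pin across the dangerous window */
	o->reclaimed = 1;		/* stand-in for VOP_RECLAIM() */
	if (active)
		obj_rele(o);		/* drop the pin */
}

int main(void)
{
	struct obj o = { 1, 0 };

	teardown(&o);
	printf("refs=%d reclaimed=%d\n", o.refs, o.reclaimed);	/* 1 1 */
	return 0;
}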
*/ if (vp->v_type == VBLK || vp->v_type == VCHR) { if (*vp->v_hashchain == vp) { *vp->v_hashchain = vp->v_specnext; } else { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, move it to * the head of the list. The test of the back pointer and the * reference count of zero is because it will be removed from the free * list by getnewvnode, but will not have its reference count * incremented until after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to close the * previous instance of the underlying object. So, the back pointer is * explicitly set to `0xdeadb' in getnewvnode after removing it from * the freelist to ensure that we do not try to move it here. */ if (vp->v_usecount == 0 && vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb && vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } vp->v_type = VBAD; } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (dev != vp->v_rdev || type != vp->v_type) continue; *vpp = vp; return (1); } return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { register struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { vgone(vq); goto loop; } count += vq->v_usecount; } return (count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[64]; if (label != NULL) printf("%s: ", label); printf("type %s, usecount %d, writecount %d, refcount %ld,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. 
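The special-device branch of vgone() above unlinks the vnode from its singly linked SPECHASH chain by hand, walking to the predecessor when the vnode is not the chain head, and panicking ("missing bdev") if it cannot be found. The unlink itself, in isolation:

#include <stdio.h>
#include <stdlib.h>

struct snode {
	int dev;
	struct snode *specnext;
};

/*
 * Unlink vp from a singly linked hash chain, walking to its
 * predecessor when it is not the head -- the same shape as the
 * v_hashchain/v_specnext surgery in vgone().
 */
static int
chain_remove(struct snode **chainp, struct snode *vp)
{
	struct snode *vq;

	if (*chainp == vp) {
		*chainp = vp->specnext;
		return 0;
	}
	for (vq = *chainp; vq != NULL; vq = vq->specnext) {
		if (vq->specnext == vp) {
			vq->specnext = vp->specnext;
			return 0;
		}
	}
	return -1;			/* the kernel would panic("missing bdev") */
}

int main(void)
{
	struct snode c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct snode *chain = &a, *p;

	chain_remove(&chain, &b);
	for (p = chain; p != NULL; p = p->specnext)
		printf("%d ", p->dev);	/* 1 3 */
	printf("\n");
	return 0;
}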
*/ void printlockedvnodes(void) { register struct mount *mp; register struct vnode *vp; printf("Locked vnodes\n"); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) if (VOP_ISLOCKED(vp)) vprint((char *) 0, vp); } } #endif int kinfo_vdebug = 1; int kinfo_vgetfailed; #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { register struct mount *mp, *nmp; struct vnode *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (vfs_busy(mp)) continue; again: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { /* * Check that the vp is still associated with this * filesystem. RACE: could have been recycled onto * the same filesystem. */ if (vp->v_mount != mp) { if (kinfo_vdebug) printf("kinfo: vp changed\n"); goto again; } if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) { vfs_unbusy(mp); return (error); } } vfs_unbusy(mp); } return (0); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_specflags & SI_MOUNTEDON) return (EBUSY); if (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specflags & SI_MOUNTEDON) return (EBUSY); } } return (0); } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. 
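sysctl_vnode() above follows the usual two-phase sysctl protocol: with no output buffer (req->oldptr == NULL) it returns a padded size estimate, with KINFO_VNODESLOP absorbing growth between the size probe and the copy, and otherwise it streams pointer/vnode pairs out. A minimal model of that protocol (struct req, dump_items, and SLOP are stand-ins for the sketch, not the real sysctl types):

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <errno.h>

/* Hypothetical stand-in for struct sysctl_req. */
struct req {
	void  *oldptr;			/* NULL means "just tell me the size" */
	size_t oldlen;			/* buffer size in, bytes written out */
};

#define SLOP 10				/* growth padding, like KINFO_VNODESLOP */

static int
dump_items(struct req *req, const int *items, size_t nitems)
{
	if (req->oldptr == NULL) {	/* phase 1: padded size estimate */
		req->oldlen = (nitems + SLOP) * sizeof(int);
		return 0;
	}
	if (req->oldlen < nitems * sizeof(int))
		return ENOMEM;		/* grew past the estimate */
	memcpy(req->oldptr, items, nitems * sizeof(int));	/* phase 2 */
	req->oldlen = nitems * sizeof(int);
	return 0;
}

int main(void)
{
	int items[3] = { 1, 2, 3 }, buf[16];
	struct req r = { NULL, 0 };

	dump_items(&r, items, 3);	/* ask for the size first */
	r.oldptr = buf;			/* then come back with a buffer */
	r.oldlen = sizeof(buf);
	dump_items(&r, items, 3);
	printf("copied %zu bytes, first item %d\n", r.oldlen, buf[0]);
	return 0;
}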
*/ static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(struct radix_node *rn, void *w) { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct mbuf *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = mtod(nam, struct sockaddr *); rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr) ((caddr_t) saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. 
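vfs_hang_addrlist() above makes a single malloc cover the netcred, the address, and the mask, then carves the pieces out with pointer arithmetic: saddr = (struct sockaddr *)(np + 1). The trailing-payload idiom on its own (struct cred and cred_create are hypothetical names for the sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cred {
	int flags;
	/* variable-length address bytes follow in the same allocation */
};

/*
 * One allocation, two logical objects: the header and its payload,
 * addressed as (np + 1) -- the idiom vfs_hang_addrlist() uses for the
 * netcred + sockaddr + mask block.
 */
static struct cred *
cred_create(int flags, const void *addr, size_t addrlen)
{
	struct cred *np = malloc(sizeof(*np) + addrlen);

	if (np == NULL)
		return NULL;
	np->flags = flags;
	memcpy(np + 1, addr, addrlen);	/* payload lives past the header */
	return np;
}

int main(void)
{
	const char addr[] = "192.0.2.1";
	struct cred *np = cred_create(7, addr, sizeof(addr));

	if (np != NULL) {
		printf("flags=%d addr=%s\n", np->flags, (char *)(np + 1));
		free(np);
	}
	return 0;
}

One allocation also means one free(): the error path at out: releases the whole block, addresses included.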
*/ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) continue; if (vp->v_object && (((vm_object_t) vp->v_object)->flags & OBJ_MIGHTBEDIRTY)) { vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); } } } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 13489) +++ head/sys/kern/vfs_subr.c (revision 13490) @@ -1,1538 +1,1547 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 - * $Id: vfs_subr.c,v 1.50 1996/01/02 18:13:20 davidg Exp $ + * $Id: vfs_subr.c,v 1.51 1996/01/04 21:12:26 wollman Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void printlockedvnodes __P((void)); #endif extern void vclean __P((struct vnode *vp, int flags)); extern void vfs_unmountroot __P((struct mount *rootfs)); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * Insq/Remq for the vnode usage lists. */ #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) #define bufremvn(bp) { \ LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ u_long freevnodes = 0; struct mntlist mountlist; /* mounted filesystem list */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RD, &desiredvnodes, 0, ""); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + vm_object_cache_max; TAILQ_INIT(&vnode_free_list); CIRCLEQ_INIT(&mountlist); } /* * Lock a filesystem. * Used to prevent access to it while mounting and unmounting. */ int vfs_lock(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MLOCK) { mp->mnt_flag |= MNT_MWAIT; (void) tsleep((caddr_t) mp, PVFS, "vfslck", 0); } mp->mnt_flag |= MNT_MLOCK; return (0); } /* * Unlock a locked filesystem. * Panic if filesystem is not locked. */ void vfs_unlock(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MLOCK) == 0) panic("vfs_unlock: not locked"); mp->mnt_flag &= ~MNT_MLOCK; if (mp->mnt_flag & MNT_MWAIT) { mp->mnt_flag &= ~MNT_MWAIT; wakeup((caddr_t) mp); } } /* * Mark a mount point as busy. * Used to synchronize access and to delay unmounting. */ int vfs_busy(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MPBUSY) { mp->mnt_flag |= MNT_MPWANT; (void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0); } if (mp->mnt_flag & MNT_UNMOUNT) return (1); mp->mnt_flag |= MNT_MPBUSY; return (0); } /* * Free a busy filesystem. * Panic if filesystem is not busy. 
 */
void
vfs_unbusy(mp)
	register struct mount *mp;
{
	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
		panic("vfs_unbusy: not busy");
	mp->mnt_flag &= ~MNT_MPBUSY;
	if (mp->mnt_flag & MNT_MPWANT) {
		mp->mnt_flag &= ~MNT_MPWANT;
		wakeup((caddr_t) &mp->mnt_flag);
	}
}

void
vfs_unmountroot(struct mount *rootfs)
{
	struct mount *mp = rootfs;
	int error;

	if (vfs_busy(mp)) {
		printf("failed to unmount root\n");
		return;
	}
	mp->mnt_flag |= MNT_UNMOUNT;
	if ((error = vfs_lock(mp))) {
		printf("lock of root filesystem failed (%d)\n", error);
		return;
	}
	vnode_pager_umount(mp);	/* release cached vnodes */
	cache_purgevfs(mp);	/* remove cache entries for this file sys */

	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
		printf("sync of root filesystem failed (%d)\n", error);

	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
		printf("unmount of root filesystem failed (");
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
	mp->mnt_flag &= ~MNT_UNMOUNT;
	vfs_unbusy(mp);
}

/*
 * Unmount all filesystems.  Should only be called by halt().
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp, *rootfs = NULL;
	int error;

	/* unmount all but rootfs */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;

		if (mp->mnt_flag & MNT_ROOTFS) {
			rootfs = mp;
			continue;
		}
		error = dounmount(mp, MNT_FORCE, initproc);
		if (error) {
			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}

	/* and finally... */
	if (rootfs) {
		vfs_unmountroot(rootfs);
	} else {
		printf("no root filesystem\n");
	}
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
			return (mp);
	}
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
getnewfsid(mp, mtype)
	struct mount *mp;
	int mtype;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;

	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
	    vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
	    vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	register struct vnode *vp;

+retry:
	vp = vnode_free_list.tqh_first;
	/*
	 * we allocate a new vnode if
	 * 1. we don't have any free
	 *	Pretty obvious, we actually used to panic, but that
	 *	is a silly thing to do.
	 * 2. we haven't filled our pool yet
	 *	We don't want to trash the incore (VM-)vnodecache.
	 * 3. if less than 1/4th of our vnodes are free.
* We don't want to trash the namei cache either. */ if (freevnodes < (numvnodes >> 2) || numvnodes < desiredvnodes || vp == NULL) { vp = (struct vnode *) malloc((u_long) sizeof *vp, M_VNODE, M_WAITOK); bzero((char *) vp, sizeof *vp); numvnodes++; } else { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + if (vp->v_usage > 0) { + --vp->v_usage; + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + goto retry; + } freevnodes--; - if (vp->v_usecount) - panic("free vnode isn't"); - /* see comment on why 0xdeadb is set at end of vgone (below) */ vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb; vp->v_lease = NULL; if (vp->v_type != VBAD) vgone(vp); + if (vp->v_usecount) + panic("free vnode isn't"); + #ifdef DIAGNOSTIC { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); } #endif vp->v_flag = 0; vp->v_lastr = 0; vp->v_ralen = 0; vp->v_maxra = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; vp->v_writecount = 0; /* XXX */ + vp->v_usage = 0; } vp->v_type = VNON; cache_purge(vp); vp->v_tag = tag; vp->v_op = vops; insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; return (0); } /* * Move a vnode from one mount queue to another. */ void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) return; LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } for (;;) { if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; error = tsleep((caddr_t) bp, slpflag | (PRIBIO + 1), "vinvalbuf", slptimeo); splx(s); if (error) return (error); break; } bremfree(bp); bp->b_flags |= B_BUSY; splx(s); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. 
*/ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } s = splbio(); while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, object->size, (flags & V_SAVE) ? TRUE : FALSE); } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; if (bp->b_vp) panic("bgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); splx(s); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Associate a p-buffer with a vnode. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { if (bp->b_vp) panic("pbgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { struct vnode *vp; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { register struct buflists *listheadp; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; tbp = newvp->v_dirtyblkhd.lh_first; if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { bufinsvn(bp, &newvp->v_dirtyblkhd); } else { - while (tbp->b_vnbufs.le_next && (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { tbp = tbp->b_vnbufs.le_next; } LIST_INSERT_AFTER(tbp, bp, b_vnbufs); } } else { listheadp = &newvp->v_cleanblkhd; bufinsvn(bp, listheadp); } } /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. * Also used for memory file system special devices. 
*/ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) return (0); error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; dev_t nvp_rdev; struct mount *mp; { register struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vp->v_usecount == 0) { vgone(vp); goto loop; } if (vget(vp, 1)) goto loop; break; } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; nvp->v_specflags = 0; *vpp = nvp; if (vp != NULL) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } VOP_UNLOCK(vp); vclean(vp, 0); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, lockflag) register struct vnode *vp; int lockflag; { /* * If the vnode is in the process of being cleaned out for another * use, we wait for the cleaning to finish and then return failure. * Cleaning is determined either by checking that the VXLOCK flag is * set, or that the use count is zero with the back pointer set to * show that it has been removed from the free list by getnewvnode. * The VXLOCK flag may not have been set yet because vclean is blocked * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete. */ if ((vp->v_flag & VXLOCK) || (vp->v_usecount == 0 && vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vget", 0); return (1); } if (vp->v_usecount == 0) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } vp->v_usecount++; if (lockflag) VOP_LOCK(vp); return (0); } /* * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { if (vp->v_usecount <= 0) panic("vref used where vget required"); vp->v_usecount++; } /* * vput(), just unlock and vrele() */ void vput(vp) register struct vnode *vp; { VOP_UNLOCK(vp); vrele(vp); } /* * Vnode release. * If count drops to zero, call inactive routine and return to freelist. 
*/ void vrele(vp) register struct vnode *vp; { #ifdef DIAGNOSTIC if (vp == NULL) panic("vrele: null vp"); #endif vp->v_usecount--; if (vp->v_usecount > 0) return; if (vp->v_usecount < 0 /* || vp->v_writecount < 0 */ ) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif panic("vrele: negative reference cnt"); } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; + vp->v_usage = 0; } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; VOP_INACTIVE(vp); } #ifdef DIAGNOSTIC /* * Page or buffer structure gets a reference. */ void vhold(vp) register struct vnode *vp; { vp->v_holdcnt++; } /* * Page or buffer structure frees a reference. */ void holdrele(vp) register struct vnode *vp; { if (vp->v_holdcnt <= 0) panic("holdrele: holdcnt"); vp->v_holdcnt--; } #endif /* DIAGNOSTIC */ /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { register struct vnode *vp, *nvp; int busy = 0; if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vflush: not busy"); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) continue; /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) continue; /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { if (vp->v_type != VBLK && vp->v_type != VCHR) { vgone(vp); } else { vclean(vp, 0); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif busy++; } if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ void vclean(struct vnode *vp, int flags) { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) VREF(vp); /* * Even if the count is zero, the VOP_INACTIVE routine may still have * the object locked while it cleans it out. The VOP_LOCK ensures that * the VOP_INACTIVE routine is done with its work. For active vnodes, * it ensures that no other activity can occur while the underlying * object is being cleaned out. */ VOP_LOCK(vp); /* * Prevent the vnode from being recycled or brought into use while we * clean it out. 
*/ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Clean out any buffers associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); /* * Any other processes trying to obtain this lock must first wait for * VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp); /* * If purging an active vnode, it must be closed and deactivated * before being reclaimed. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); VOP_INACTIVE(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vclean: cannot reclaim"); if (active) vrele(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vgoneall(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, wait until * it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgall", 0); return; } /* * Ensure that vp will not be vgone'd while we are eliminating * its aliases. */ vp->v_flag |= VXLOCK; while (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type || vp == vq) continue; vgone(vq); break; } } /* * Remove the lock so that vgone below will really eliminate * the vnode after which time vgone will awaken any sleepers. */ vp->v_flag &= ~VXLOCK; } vgone(vp); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { register struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, wait until it is * done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) { LIST_REMOVE(vp, v_mntvnodes); vp->v_mount = NULL; } /* * If special device, remove it from special device alias list. */ if (vp->v_type == VBLK || vp->v_type == VCHR) { if (*vp->v_hashchain == vp) { *vp->v_hashchain = vp->v_specnext; } else { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, move it to * the head of the list. The test of the back pointer and the * reference count of zero is because it will be removed from the free * list by getnewvnode, but will not have its reference count * incremented until after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to close the * previous instance of the underlying object. 
So, the back pointer is * explicitly set to `0xdeadb' in getnewvnode after removing it from * the freelist to ensure that we do not try to move it here. */ if (vp->v_usecount == 0 && vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb && vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } vp->v_type = VBAD; } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (dev != vp->v_rdev || type != vp->v_type) continue; *vpp = vp; return (1); } return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { register struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { vgone(vq); goto loop; } count += vq->v_usecount; } return (count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[64]; if (label != NULL) printf("%s: ", label); printf("type %s, usecount %d, writecount %d, refcount %ld,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ void printlockedvnodes(void) { register struct mount *mp; register struct vnode *vp; printf("Locked vnodes\n"); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) if (VOP_ISLOCKED(vp)) vprint((char *) 0, vp); } } #endif int kinfo_vdebug = 1; int kinfo_vgetfailed; #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { register struct mount *mp, *nmp; struct vnode *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (vfs_busy(mp)) continue; again: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { /* * Check that the vp is still associated with this * filesystem. RACE: could have been recycled onto * the same filesystem. 
*/ if (vp->v_mount != mp) { if (kinfo_vdebug) printf("kinfo: vp changed\n"); goto again; } if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) { vfs_unbusy(mp); return (error); } } vfs_unbusy(mp); } return (0); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_specflags & SI_MOUNTEDON) return (EBUSY); if (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specflags & SI_MOUNTEDON) return (EBUSY); } } return (0); } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(struct radix_node *rn, void *w) { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. 
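vfs_free_netcred() above is shaped as a callback on purpose: the radix-tree code owns the traversal (rnh_walktree, used by vfs_free_addrlist just below), and the callback only has to delete and free the single node handed to it. A rough userland equivalent of that split, using a singly linked list in place of the radix tree; the names here are illustrative, not kernel API:

    #include <stdlib.h>

    struct entry {
        struct entry *next;
        /* payload omitted */
    };

    /* Traversal is owned by the container... */
    static void
    walk_entries(struct entry **headp, int (*f)(struct entry *, void *),
        void *arg)
    {
        struct entry *e, *next;

        for (e = *headp; e != NULL; e = next) {
            next = e->next;     /* fetch before the callback may free e */
            (*f)(e, arg);
        }
        *headp = NULL;
    }

    /* ...and the callback only deletes, as vfs_free_netcred() does. */
    static int
    free_entry(struct entry *e, void *arg)
    {
        (void)arg;
        free(e);
        return (0);
    }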
*/ static void vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct mbuf *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = mtod(nam, struct sockaddr *); rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr) ((caddr_t) saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) continue; if (vp->v_object && (((vm_object_t) vp->v_object)->flags & OBJ_MIGHTBEDIRTY)) { vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); } } } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 13489) +++ head/sys/kern/vfs_vnops.c (revision 13490) @@ -1,518 +1,518 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 - * $Id: vfs_vnops.c,v 1.21 1995/12/11 04:56:13 dyson Exp $ + * $Id: vfs_vnops.c,v 1.22 1995/12/17 21:23:24 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vn_closefile __P((struct file *fp, struct proc *p)); static int vn_ioctl __P((struct file *fp, int com, caddr_t data, struct proc *p)); static int vn_read __P((struct file *fp, struct uio *uio, struct ucred *cred)); static int vn_select __P((struct file *fp, int which, struct proc *p)); static int vn_vmio_open __P((struct vnode *vp, struct proc *p, struct ucred *cred)); static int vn_write __P((struct file *fp, struct uio *uio, struct ucred *cred)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. */ int vn_open(ndp, fmode, cmode) register struct nameidata *ndp; int fmode, cmode; { register struct vnode *vp; register struct proc *p = ndp->ni_cnd.cn_proc; register struct ucred *cred = p->p_ucred; struct vattr vat; struct vattr *vap = &vat; int error; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; error = namei(ndp); if (error) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; LEASE_CHECK(ndp->ni_dvp, p, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); if (error) return (error); fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } if ((fmode & O_CREAT) == 0) { if (fmode & FREAD) { error = VOP_ACCESS(vp, VREAD, cred, p); if (error) goto bad; } if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; error = VOP_ACCESS(vp, VWRITE, cred, p); if (error) goto bad; } } if (fmode & O_TRUNC) { VOP_UNLOCK(vp); /* XXX */ LEASE_CHECK(vp, p, cred, LEASE_WRITE); VOP_LOCK(vp); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, cred, p); if (error) goto bad; } error = VOP_OPEN(vp, fmode, cred, p); if (error) goto bad; /* * this is here for VMIO support */ if (vp->v_type == VREG) { if ((error = 
vn_vmio_open(vp, p, cred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; return (0); bad: vput(vp); return (error); } /* * Check for write permissions on the specified vnode. * The read-only status of the file system is checked. * Also, prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, cred, p) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; { int error; if (flags & FWRITE) vp->v_writecount--; error = VOP_CLOSE(vp, flags, cred, p); vn_vmio_close(vp); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct proc *p; { struct uio auio; struct iovec aiov; int error; if ((ioflg & IO_NODELOCKED) == 0) VOP_LOCK(vp); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_procp = p; if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else { error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { register struct vnode *vp = (struct vnode *)fp->f_data; int count, error; LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_READ); VOP_LOCK(vp); uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, cred); fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); return (error); } /* * File table vnode write routine. */ static int vn_write(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { register struct vnode *vp = (struct vnode *)fp->f_data; int count, error, ioflag = 0; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_WRITE); VOP_LOCK(vp); uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_WRITE(vp, uio, ioflag, cred); if (ioflag & IO_APPEND) fp->f_offset = uio->uio_offset; else fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); return (error); } /* * File table vnode stat routine. 
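vn_read() and vn_write() above recover the transfer length the same way: uio_resid counts bytes still to move, so the bytes actually moved equal the resid saved before the VOP call minus the resid after it, and that difference is what advances fp->f_offset, even on a short or failed transfer. The bookkeeping in isolation, as a sketch with simplified types; do_transfer() stands in for VOP_READ/VOP_WRITE:

    #include <assert.h>
    #include <stddef.h>

    struct xuio {
        size_t uio_resid;       /* bytes still to transfer */
        long uio_offset;
    };

    /* Stand-in for VOP_READ/VOP_WRITE: moves some, maybe not all, bytes. */
    static int
    do_transfer(struct xuio *uio)
    {
        size_t moved = uio->uio_resid / 2;  /* short transfer on purpose */

        uio->uio_offset += moved;
        uio->uio_resid -= moved;
        return (0);
    }

    static int
    file_rw(long *f_offset, struct xuio *uio)
    {
        size_t count;
        int error;

        uio->uio_offset = *f_offset;    /* start where the file table says */
        count = uio->uio_resid;         /* bytes requested */
        error = do_transfer(uio);
        /* resid-before minus resid-after == bytes actually moved */
        *f_offset += count - uio->uio_resid;
        return (error);
    }

    int
    main(void)
    {
        struct xuio uio = { 100, 0 };
        long off = 4096;

        (void)file_rw(&off, &uio);
        assert(off == 4096 + 50);   /* offset advanced by bytes moved */
        return (0);
    }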
*/ int vn_stat(vp, sb, p) struct vnode *vp; register struct stat *sb; struct proc *p; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; vap = &vattr; error = VOP_GETATTR(vp, vap, p->p_ucred, p); if (error) return (error); /* * Copy from vattr table */ sb->st_dev = vap->va_fsid; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vp->v_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec= vap->va_mtime; sb->st_ctimespec = vap->va_ctime; sb->st_blksize = vap->va_blocksize; sb->st_flags = vap->va_flags; sb->st_gen = vap->va_gen; #if (S_BLKSIZE == 512) /* Optimize this case */ sb->st_blocks = vap->va_bytes >> 9; #else sb->st_blocks = vap->va_bytes / S_BLKSIZE; #endif return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, p) struct file *fp; int com; caddr_t data; struct proc *p; { register struct vnode *vp = ((struct vnode *)fp->f_data); struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return (0); /* XXX */ /* fall into ... */ default: return (ENOTTY); case VFIFO: case VCHR: case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ if (p->p_session->s_ttyvp == vp) return (0); /* Get rid of reference to old control tty */ if (p->p_session->s_ttyvp) vrele(p->p_session->s_ttyvp); p->p_session->s_ttyvp = vp; VREF(vp); } return (error); } } /* * File table vnode select routine. */ static int vn_select(fp, which, p) struct file *fp; int which; struct proc *p; { return (VOP_SELECT(((struct vnode *)fp->f_data), which, fp->f_flag, fp->f_cred, p)); } /* * File table vnode close routine. */ static int vn_closefile(fp, p) struct file *fp; struct proc *p; { return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, fp->f_cred, p)); } static int vn_vmio_open(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { struct vattr vat; int error; /* * this is here for VMIO support */ - if (vp->v_type == VREG || vp->v_type == VBLK) { + if (vp->v_type == VREG /* || vp->v_type == VBLK */) { retry: if ((vp->v_flag & VVMIO) == 0) { if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) return error; - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { vm_object_t object; if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { VOP_UNLOCK(vp); tsleep(object, PVM, "vodead", 0); VOP_LOCK(vp); goto retry; } if (!object) panic("vn_open: VMIO object missing"); vm_object_reference(object); } } return 0; } void vn_vmio_close(vp) struct vnode *vp; { /* * this code is here for VMIO support, will eventually * be in vfs code. 
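The second hunk above converts the byte size from VOP_GETATTR into a page count before handing it to vnode_pager_alloc(), which apparently now takes its size in pages: the byte size is rounded up to a page boundary and then shifted down to a page index. Assuming the usual shapes of the vm macros (round_page() rounds up to PAGE_SIZE, OFF_TO_IDX() shifts right by PAGE_SHIFT), the arithmetic looks like this for 4K pages:

    #include <assert.h>

    /* Hypothetical stand-ins for the vm macros, assuming 4K pages. */
    #define XPAGE_SHIFT 12
    #define XPAGE_SIZE  (1UL << XPAGE_SHIFT)
    #define xround_page(x) (((x) + XPAGE_SIZE - 1) & ~(XPAGE_SIZE - 1))
    #define XOFF_TO_IDX(x) ((x) >> XPAGE_SHIFT)

    int
    main(void)
    {
        /* a 5000-byte file occupies two pages */
        assert(xround_page(5000) == 8192);
        assert(XOFF_TO_IDX(xround_page(5000)) == 2);
        /* without round_page, the trailing partial page would be lost */
        assert(XOFF_TO_IDX(5000) == 1);
        return (0);
    }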
*/ if (vp->v_flag & VVMIO) { vrele(vp); if (vp->v_object == NULL) panic("vn_close: VMIO object missing"); vm_object_deallocate(vp->v_object); } else vrele(vp); } Index: head/sys/miscfs/procfs/procfs_mem.c =================================================================== --- head/sys/miscfs/procfs/procfs_mem.c (revision 13489) +++ head/sys/miscfs/procfs/procfs_mem.c (revision 13490) @@ -1,246 +1,247 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_mem.c 8.4 (Berkeley) 1/21/94 * - * $Id: procfs_mem.c,v 1.13 1995/12/11 04:56:31 dyson Exp $ + * $Id: procfs_mem.c,v 1.14 1995/12/17 07:19:24 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *p, struct uio *uio)); static int procfs_rwmem(p, uio) struct proc *p; struct uio *uio; { int error; int writing; writing = uio->uio_rw == UIO_WRITE; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? 
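The loop that follows carves the user range into chunks that never cross a page boundary: trunc_page() yields the page base, the leftover is the offset within the page, and the chunk length is capped by both the bytes remaining in the page and the bytes remaining in the request. The same arithmetic in a standalone form, 4K pages assumed:

    #include <assert.h>
    #include <stddef.h>

    #define XPAGE_SIZE 4096UL                   /* assume 4K pages */
    #define xtrunc_page(x) ((x) & ~(XPAGE_SIZE - 1))

    static size_t
    min_sz(size_t a, size_t b) { return (a < b ? a : b); }

    int
    main(void)
    {
        unsigned long uva = 0x1ffe;     /* 2 bytes before a page boundary */
        size_t resid = 100;

        while (resid > 0) {
            unsigned long pageno = xtrunc_page(uva);
            size_t page_offset = uva - pageno;
            size_t len = min_sz(XPAGE_SIZE - page_offset, resid);

            /* ...map pageno, copy len bytes starting at page_offset... */
            uva += len;
            resid -= len;
        }
        assert(uva == 0x1ffe + 100);
        return (0);
    }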
*/ do { vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; vm_offset_t uva; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; vm_page_t m; boolean_t wired, single_use; vm_pindex_t pindex; u_int len; int fix_prot; uva = (vm_offset_t) uio->uio_offset; if (uva >= VM_MAXUSER_ADDRESS) { if (writing || (uva >= (VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE))) { error = 0; break; } } /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * Check the permissions for the area we're interested * in. */ fix_prot = 0; if (writing) fix_prot = !vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE); if (fix_prot) { /* * If the page is not writable, we make it so. * XXX It is possible that a page may *not* be * read/executable, if a process changes that! * We will assume, for now, that a page is either * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE. */ error = vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0); if (error) break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, writing ? VM_PROT_WRITE : VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); /* * We're done with tmap now. */ if (!error) vm_map_lookup_done(tmap, out_entry); /* * Fault the page in... */ if (!error && writing && object->backing_object) { m = vm_page_lookup(object, pindex); if (m == 0) error = vm_fault(map, pageno, VM_PROT_WRITE, FALSE); } /* Find space in kernel_map for the page we're interested in */ if (!error) error = vm_map_find(kernel_map, object, - IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1); + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!error) { /* * Neither vm_map_lookup() nor vm_map_find() appear * to add a reference count to the object, so we do * that here and now. */ vm_object_reference(object); /* * Mark the page we just found as pageable. */ error = vm_map_pageable(kernel_map, kva, kva + PAGE_SIZE, 0); /* * Now do the i/o move. */ if (!error) error = uiomove((caddr_t)(kva + page_offset), len, uio); vm_map_remove(kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); } while (error == 0 && uio->uio_resid > 0); return (error); } /* * Copy data in and out of the target process. * We do this by mapping the process's page into * the kernel and then doing a uiomove directly * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { int error; if (uio->uio_resid == 0) return (0); error = procfs_rwmem(p, uio); return (error); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode.
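The invariant behind that held reference is easy to state: whoever installs a new text vnode takes a reference before dropping the one being replaced. A toy sketch of the hand-off; xvref()/xvrele() are hypothetical stand-ins for the kernel's VREF()/vrele():

    struct xvnode { int refcnt; };

    static void xvref(struct xvnode *vp)  { vp->refcnt++; }
    static void xvrele(struct xvnode *vp) { vp->refcnt--; }

    static void
    set_textvp(struct xvnode **p_textvp, struct xvnode *newvp)
    {
        if (newvp != NULL)
            xvref(newvp);           /* hold the new text vnode... */
        if (*p_textvp != NULL)
            xvrele(*p_textvp);      /* ...before releasing the old one */
        *p_textvp = newvp;
    }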
*/ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } Index: head/sys/msdosfs/msdosfs_denode.c =================================================================== --- head/sys/msdosfs/msdosfs_denode.c (revision 13489) +++ head/sys/msdosfs/msdosfs_denode.c (revision 13490) @@ -1,728 +1,730 @@ -/* $Id: msdosfs_denode.c,v 1.14 1995/12/03 16:41:53 bde Exp $ */ +/* $Id: msdosfs_denode.c,v 1.15 1995/12/07 12:47:19 davidg Exp $ */ /* $NetBSD: msdosfs_denode.c,v 1.9 1994/08/21 18:44:00 ws Exp $ */ /*- * Copyright (C) 1994 Wolfgang Solfrank. * Copyright (C) 1994 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
* * October 1992 */ #include #include #include #include #include #include #include #include #include /* defines "time" */ #include #include #include #include #include #include #include #include struct denode **dehashtbl; u_long dehash; /* size of hash table - 1 */ #define DEHASH(dev, deno) (((dev) + (deno)) & dehash) union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } static struct denode * msdosfs_hashget __P((dev_t dev, u_long dirclust, u_long diroff)); static void msdosfs_hashins __P((struct denode *dep)); static void msdosfs_hashrem __P((struct denode *dep)); int msdosfs_init() { dehashtbl = hashinit(desiredvnodes/2, M_MSDOSFSMNT, &dehash); return 0; } static struct denode * msdosfs_hashget(dev, dirclust, diroff) dev_t dev; u_long dirclust; u_long diroff; { struct denode *dep; for (;;) for (dep = dehashtbl[DEHASH(dev, dirclust + diroff)];; dep = dep->de_next) { if (dep == NULL) return NULL; if (dirclust != dep->de_dirclust || diroff != dep->de_diroffset || dev != dep->de_dev || dep->de_refcnt == 0) continue; if (dep->de_flag & DE_LOCKED) { dep->de_flag |= DE_WANTED; (void) tsleep((caddr_t)dep, PINOD, "msdhgt", 0); break; } if (!vget(DETOV(dep), 1)) return dep; break; } /* NOTREACHED */ } static void msdosfs_hashins(dep) struct denode *dep; { struct denode **depp, *deq; depp = &dehashtbl[DEHASH(dep->de_dev, dep->de_dirclust + dep->de_diroffset)]; deq = *depp; if (deq) deq->de_prev = &dep->de_next; dep->de_next = deq; dep->de_prev = depp; *depp = dep; } static void msdosfs_hashrem(dep) struct denode *dep; { struct denode *deq; deq = dep->de_next; if (deq) deq->de_prev = dep->de_prev; *dep->de_prev = deq; #ifdef DIAGNOSTIC dep->de_next = NULL; dep->de_prev = NULL; #endif } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The pm_dev field and the address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * direntptr - address of the direntry structure of interest. If direntptr is * NULL, the block is read if necessary. * depp - returns the address of the gotten denode. */ int deget(pmp, dirclust, diroffset, direntptr, depp) struct msdosfsmount *pmp; /* so we know the maj/min number */ u_long dirclust; /* cluster this dir entry came from */ u_long diroffset; /* index of entry within the cluster */ struct direntry *direntptr; struct denode **depp; /* returns the addr of the gotten denode */ { int error; dev_t dev = pmp->pm_dev; struct mount *mntp = pmp->pm_mountp; struct denode *ldep; struct vnode *nvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %ld, diroffset %x, direntptr %p, depp %p)\n", pmp, dirclust, diroffset, direntptr, depp); #endif /* * If dir entry is given and refers to a directory, convert to * canonical form */ if (direntptr && (direntptr->deAttributes & ATTR_DIRECTORY)) { dirclust = getushort(direntptr->deStartCluster); if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } /* * See if the denode is in the denode cache. 
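The cache is keyed on (device, dirclust + diroffset); DEHASH folds those into a bucket index by masking with dehash, which works because hashinit() sizes the table to a power of two and stores size minus one. The bucket arithmetic on its own; the table size here is an assumption:

    #include <assert.h>

    /*
     * dehash is table_size - 1 with table_size a power of two, so
     * '& dehash' is a cheap modulo (this mirrors the DEHASH macro).
     */
    static unsigned long
    dehash_bucket(unsigned long dev, unsigned long dirclust,
        unsigned long diroff, unsigned long dehash)
    {
        return ((dev + dirclust + diroff) & dehash);
    }

    int
    main(void)
    {
        unsigned long dehash = 128 - 1;     /* assume a 128-bucket table */

        assert(dehash_bucket(5, 300, 32, dehash) == ((5 + 300 + 32) & 127));
        assert(dehash_bucket(5, 300, 32, dehash) < 128);
        return (0);
    }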
Use the location of * the directory entry to compute the hash value. For a subdir use the * address of its "." entry; for the root dir use cluster MSDOSFSROOT, * offset MSDOSFSROOT_OFS. * * NOTE: The check for de_refcnt > 0 below ensures the denode being * examined does not represent an unlinked but still open file. * Such files must not be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ ldep = msdosfs_hashget(dev, dirclust, diroffset); if (ldep) { *depp = ldep; return 0; } /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode(VT_MSDOSFS, mntp, msdosfs_vnodeop_p, &nvp); if (error) { *depp = 0; return error; } MALLOC(ldep, struct denode *, sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK); bzero((caddr_t)ldep, sizeof *ldep); nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_devvp = 0; ldep->de_lockf = 0; ldep->de_dev = dev; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; fc_purge(ldep, 0); /* init the fat cache for this denode */ /* * Insert the denode into the hash queue and lock the denode so it * can't be accessed until we've read it in and have done what we * need to it. */ VOP_LOCK(nvp); msdosfs_hashins(ldep); /* * Copy the directory entry into the denode area of the vnode. */ if (dirclust == MSDOSFSROOT && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * pmp->pm_BytesPerSec; /* * fill in time and date so that dos2unixtime() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_Time = 0x0000; /* 00:00:00 */ ldep->de_Date = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ /* leave the other fields as garbage */ } else { bp = NULL; if (!direntptr) { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) return error; } DE_INTERNALIZE(ldep, direntptr); if (bp) brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ ldep->de_pmp = pmp; ldep->de_devvp = pmp->pm_devvp; ldep->de_refcnt = 1; if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure.
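Since the on-disk entry stores 0 for a directory's size, the code just below measures the directory by walking its FAT chain: pcbmap(), asked for an impossibly large cluster index, fails with E2BIG but reports how many clusters the chain really holds, and shifting by pm_cnshift (log2 of the cluster size) converts clusters to bytes. The shift in isolation, assuming 2K clusters:

    #include <assert.h>

    int
    main(void)
    {
        unsigned long nclusters = 3;
        unsigned long cnshift = 11;     /* assume 2K clusters: 1 << 11 */

        /*
         * Clusters-to-bytes is a shift because the cluster size is a
         * power of two (what 'size << pmp->pm_cnshift' does in deget).
         */
        assert((nclusters << cnshift) == 3 * 2048);
        return (0);
    }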
*/ u_long size; nvp->v_type = VDIR; if (ldep->de_StartCluster == MSDOSFSROOT) nvp->v_flag |= VROOT; else { error = pcbmap(ldep, 0xffff, 0, &size); if (error == E2BIG) { ldep->de_FileSize = size << pmp->pm_cnshift; error = 0; } else printf("deget(): pcbmap returned %d\n", error); } } else nvp->v_type = VREG; SETHIGH(ldep->de_modrev, mono_time.tv_sec); SETLOW(ldep->de_modrev, mono_time.tv_usec * 4294); VREF(ldep->de_devvp); *depp = ldep; return 0; } int deupdat(dep, tp, waitfor) struct denode *dep; struct timespec *tp; int waitfor; { int error; struct buf *bp; struct direntry *dirp; struct vnode *vp = DETOV(dep); #ifdef MSDOSFS_DEBUG printf("deupdat(): dep %p\n", dep); #endif /* * If the denode-modified and update-mtime bits are off, * or this denode is from a readonly filesystem, * or this denode is for a directory, * or the denode represents an open but unlinked file, * then don't do anything. DOS directory * entries that describe a directory do not ever get * updated. This is the way DOS treats them. */ if ((dep->de_flag & (DE_MODIFIED | DE_UPDATE)) == 0 || vp->v_mount->mnt_flag & MNT_RDONLY || dep->de_Attributes & ATTR_DIRECTORY || dep->de_refcnt <= 0) return 0; /* * Read in the cluster containing the directory entry we want to * update. */ error = readde(dep, &bp, &dirp); if (error) return error; /* * If the mtime is to be updated, put the passed-in time into the * directory entry. */ if (dep->de_flag & DE_UPDATE) { dep->de_Attributes |= ATTR_ARCHIVE; unix2dostime(tp, &dep->de_Date, &dep->de_Time); } /* * The mtime is now up to date. The denode will be unmodified soon. */ dep->de_flag &= ~(DE_MODIFIED | DE_UPDATE); /* * Copy the directory entry out of the denode into the cluster it * came from. */ DE_EXTERNALIZE(dirp, dep); /* * Write the cluster back to disk. If they asked us to wait * for the write to complete, then use bwrite() otherwise use * bdwrite(). */ error = 0; /* note that error is 0 from above, but ... */ if (waitfor) error = bwrite(bp); else bdwrite(bp); return error; } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(dep, length, flags, cred, p) struct denode *dep; u_long length; int flags; struct ucred *cred; struct proc *p; { int error; int allerror; int vflags; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %d, flags %d\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if (DETOV(dep)->v_flag & VROOT) { printf( "detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); return EINVAL; } - vnode_pager_setsize(DETOV(dep), length); - if (dep->de_FileSize < length) + if (dep->de_FileSize < length) { + vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); + } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of
Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return error; } } fc_purge(dep, (length + pmp->pm_crbomask) >> pmp->pm_cnshift); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { /* * should read from file vnode or filesystem vnode * depending on if file or dir */ if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { bn = de_blk(pmp, length); error = bread(DETOV(dep), bn, pmp->pm_bpcluster, NOCRED, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return error; } /* * is this the right place for it? */ bzero(bp->b_data + boff, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; dep->de_flag |= DE_UPDATE; vflags = (length > 0 ? V_SAVE : 0) | V_SAVEMETA; vinvalbuf(DETOV(dep), vflags, cred, p, 0, 0); + vnode_pager_setsize(DETOV(dep), length); TIMEVAL_TO_TIMESPEC(&time, &ts); allerror = deupdat(dep, &ts, 1); #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %d\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return error; } fc_setcache(dep, FC_LASTFC, (length - 1) >> pmp->pm_cnshift, eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(chaintofree)) freeclusterchain(pmp, chaintofree); return allerror; } /* * Extend the file described by dep to length specified by length. */ int deextend(dep, length, cred) struct denode *dep; off_t length; struct ucred *cred; { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; struct timespec ts; /* * The root of a DOS filesystem cannot be extended. */ if (DETOV(dep)->v_flag & VROOT) return EINVAL; /* * Directories can only be extended by the superuser. * Is this really important? */ if (dep->de_Attributes & ATTR_DIRECTORY) { error = suser(cred, NULL); if (error) return error; } if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return ENOSPC; error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred, NULL); return error; } } dep->de_flag |= DE_UPDATE; dep->de_FileSize = length; TIMEVAL_TO_TIMESPEC(&time, &ts); return deupdat(dep, &ts, 1); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. 
*/ int reinsert(dep) struct denode *dep; { /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { msdosfs_hashrem(dep); msdosfs_hashins(dep); } return 0; } int msdosfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_reclaim(): pushing active", vp); /* * Remove the denode from the denode hash chain we are in. */ msdosfs_hashrem(dep); cache_purge(vp); /* * Indicate that one less file on the filesystem is open. */ if (dep->de_devvp) { vrele(dep->de_devvp); dep->de_devvp = 0; } dep->de_flag = 0; FREE(dep, M_MSDOSFSNODE); vp->v_data = NULL; return 0; } int msdosfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_inactive(): pushing active", vp); /* * Get rid of denodes related to stale file handles. Hmmm, what * does this really do? */ if (dep->de_Name[0] == SLOT_DELETED) { if ((vp->v_flag & VXLOCK) == 0) vgone(vp); return 0; } /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %x, MNT_RDONLY %x\n", dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif VOP_LOCK(vp); if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED, NULL); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } if (dep->de_flag & (DE_MODIFIED | DE_UPDATE)) { TIMEVAL_TO_TIMESPEC(&time, &ts); deupdat(dep, &ts, 0); } VOP_UNLOCK(vp); dep->de_flag = 0; /* * If we are done with the denode, then reclaim it so that it can * be reused now. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vp->v_usecount, dep->de_Name[0]); #endif if (vp->v_usecount == 0 && dep->de_Name[0] == SLOT_DELETED) vgone(vp); return error; } Index: head/sys/nfs/nfs_common.c =================================================================== --- head/sys/nfs/nfs_common.c (revision 13489) +++ head/sys/nfs/nfs_common.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
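The protocol version of the mount decides which table applies: an NFSv3 mount sends generic numbers unchanged, while a v2 mount translates through nfsv2_procid[], which is exactly the test nfsm_rpchead() makes further down. A sketch of that selection; the flag name and table contents here are placeholders:

    /* Placeholder table: generic procedure -> v2 procedure number. */
    static const int xv2_procid[] = { 0, 1, 2, 4, 6 };

    static int
    wire_procnum(int generic_proc, int mount_is_v3)
    {
        /* v3 speaks generic numbers natively; v2 needs the mapping */
        return (mount_is_v3 ? generic_proc : xv2_procid[generic_proc]);
    }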
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
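Each per-RPC list below is laid out for one consumption pattern: entry 0 is the default reply, the remaining entries are the permitted errors in ascending order, and a zero terminates the list. The filtering step as a standalone sketch; the error numbers in the test are illustrative:

    #include <assert.h>

    /*
     * First entry is the default return; the rest are the valid errors
     * for the RPC, ascending, terminated by 0.
     */
    static short
    errmap_filter(const short *tab, short err)
    {
        const short *sp;

        for (sp = tab + 1; *sp != 0; sp++)
            if (*sp == err)
                return (err);   /* permitted for this RPC */
        return (tab[0]);        /* not permitted: use the default */
    }

    int
    main(void)
    {
        static const short getattr_errs[] = { 5, 5, 70, 10001, 10006, 0 };

        assert(errmap_filter(getattr_errs, 70) == 70);  /* kept as-is */
        assert(errmap_filter(getattr_errs, 13) == 5);   /* filtered */
        return (0);
    }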
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
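The header that follows is eight fixed-size words, which is why a single nfsm_build() below can reserve them all: transaction id, message type, RPC version, program, program version, procedure, then the authentication flavor and length, each byte-swapped by txdr_unsigned(). A compact illustration of the layout; the constants are placeholders and htonl() stands in for txdr_unsigned():

    #include <arpa/inet.h>
    #include <stdint.h>

    #define X_RPC_CALL 0        /* message is a call, not a reply */
    #define X_RPC_VER 2         /* RPC protocol version */

    /*
     * Lay down the fixed eight-word RPC call header, mirroring the
     * sequence of *tl++ stores below (placeholder names throughout).
     */
    static void
    build_rpc_header(uint32_t hdr[8], uint32_t xid, uint32_t prog,
        uint32_t vers, uint32_t proc, uint32_t auth_type, uint32_t auth_len)
    {
        hdr[0] = htonl(xid);        /* transaction id */
        hdr[1] = htonl(X_RPC_CALL);
        hdr[2] = htonl(X_RPC_VER);
        hdr[3] = htonl(prog);       /* e.g. the NFS program number */
        hdr[4] = htonl(vers);
        hdr[5] = htonl(proc);       /* procedure within the program */
        hdr[6] = htonl(auth_type);  /* credential flavor */
        hdr[7] = htonl(auth_len);   /* credential length, rounded up */
    }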
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
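 *
 * The block of txdr_unsigned() assignments above trades a little
 * startup work for a cheaper fast path: constants such as rpc_call
 * and rpc_vers are byte-swapped once at boot and can then be stored
 * into headers verbatim.  A minimal sketch of the idea (names with
 * the _x suffix are hypothetical):
 *
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t rpc_call_x, rpc_vers_x;	// pre-encoded constants
 *
 *	static void
 *	rpc_const_init_x(void)
 *	{
 *		rpc_call_x = htonl(0);	// msg_type CALL, already XDR'd
 *		rpc_vers_x = htonl(2);	// RPC version 2, already XDR'd
 *	}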
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information.
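 *
 * The fallback above derives the vnode type from the S_IFMT bits of
 * fa_mode whenever fa_type is VNON or an untrustworthy VREG.  A
 * minimal userland sketch of that mapping (IFTOVT() is the real
 * macro; vtype_x and mode_to_vtype are hypothetical stand-ins):
 *
 *	#include <sys/stat.h>
 *
 *	enum vtype_x { XNON, XREG, XDIR, XBLK, XCHR, XLNK, XSOCK, XFIFO };
 *
 *	static enum vtype_x
 *	mode_to_vtype(unsigned short mode)
 *	{
 *		switch (mode & S_IFMT) {	// type lives in the top bits
 *		case S_IFREG:	return (XREG);
 *		case S_IFDIR:	return (XDIR);
 *		case S_IFBLK:	return (XBLK);
 *		case S_IFCHR:	return (XCHR);
 *		case S_IFLNK:	return (XLNK);
 *		case S_IFSOCK:	return (XSOCK);
 *		case S_IFIFO:	return (XFIFO);
 *		default:	return (XNON);	// no type bits: give up
 *		}
 *	}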
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
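 *
 * A flat-buffer analogue of the trim performed here: drop bytes from
 * the tail, then force the last nul bytes to NUL so a shortened XDR
 * string stays zero-padded on the wire.  A minimal sketch (buf_adj
 * is a hypothetical stand-in for the mbuf-chain version):
 *
 *	#include <string.h>
 *
 *	static size_t
 *	buf_adj(char *buf, size_t len, size_t trim, size_t nul)
 *	{
 *		len -= (trim > len) ? len : trim;	// trim the tail
 *		if (nul > 0 && nul <= len)
 *			memset(buf + len - nul, 0, nul);  // null fill
 *		return (len);
 *	}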
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
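 *
 * The cookie map being reset here is the one nfs_getcookie() above
 * maintains: a logical offset selects a directory block, and the
 * block index selects a (chunk, slot) pair in the linked nfsdmap
 * chunks, with block 0 pinned to the null cookie.  A sketch of the
 * addressing, assuming illustrative constants (_x names are
 * hypothetical):
 *
 *	#define DIRBLKSIZ_X	512	// assumed directory block size
 *	#define NUMCOOKIES_X	31	// assumed cookies per chunk
 *
 *	static void
 *	cookie_slot(long off, int *chunkp, int *slotp)
 *	{
 *		int pos = off / DIRBLKSIZ_X;	// which directory block
 *
 *		if (pos == 0) {		// block 0 gets the null cookie
 *			*chunkp = *slotp = -1;
 *			return;
 *		}
 *		pos--;
 *		*chunkp = pos / NUMCOOKIES_X;	// which nfsdmap entry
 *		*slotp = pos % NUMCOOKIES_X;	// slot in ndm_cookies[]
 *	}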
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfs/nfs_subs.c =================================================================== --- head/sys/nfs/nfs_subs.c (revision 13489) +++ head/sys/nfs/nfs_subs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
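 *
 * A table like the one defined just below is consulted with a bounds
 * check on the wire procedure number, since a confused or malicious
 * client can send anything.  A minimal sketch of that translation
 * (the _x names and the table size are illustrative assumptions):
 *
 *	#define NPROCS_X 26	// assumed size of the mapping table
 *
 *	static int
 *	map_wire_proc(unsigned int wireproc, const int map[NPROCS_X],
 *	    int badproc)
 *	{
 *		if (wireproc >= NPROCS_X)
 *			return (badproc);	// out of range: reject
 *		return (map[wireproc]);
 *	}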
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
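 *
 * That layout, default entry first and then a 0-terminated ascending
 * list, lets nfsrv_errmap() filter with a single forward scan and an
 * early exit.  A sketch of the consuming side (errmap_filter is a
 * hypothetical stand-in):
 *
 *	static int
 *	errmap_filter(const short *tbl, int err)
 *	{
 *		const short *p = tbl;
 *
 *		while (*++p) {		// entries 1..n are sorted
 *			if (*p == err)
 *				return (err);	// legal for this RPC
 *			if (*p > err)
 *				break;		// passed it: not listed
 *		}
 *		return ((int)tbl[0]);	// fall back to the default
 *	}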
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
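 *
 * Note the XID discipline applied just below: the counter is bumped
 * for every request and zero is skipped, so a zero-filled reply can
 * never match an outstanding request.  As a standalone sketch
 * (next_xid is hypothetical):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	next_xid(uint32_t *xidp)
 *	{
 *		if (++(*xidp) == 0)	// 0 is reserved: skip it
 *			(*xidp)++;
 *		return (*xidp);
 *	}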
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
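 *
 * The RPCAUTH_UNIX arm above marshals the classic AUTH_UNIX body:
 * stamp, machine name (empty here), uid, gid, then a counted list of
 * supplementary gids.  A minimal sketch of that layout, assuming the
 * caller sized the buffer at (5 + ngroups) words (the function name
 * is hypothetical):
 *
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t *
 *	auth_unix_fill(uint32_t *p, uint32_t uid, uint32_t gid,
 *	    const uint32_t *gids, uint32_t ngroups)
 *	{
 *		uint32_t i;
 *
 *		*p++ = 0;		// stamp, unused here
 *		*p++ = 0;		// machine name: zero length
 *		*p++ = htonl(uid);
 *		*p++ = htonl(gid);
 *		*p++ = htonl(ngroups);	// gid list is counted
 *		for (i = 0; i < ngroups; i++)
 *			*p++ = htonl(gids[i]);
 *		return (p);
 *	}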
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
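 *
 * The nfs_ticks computation above converts the polling interval from
 * milliseconds to clock ticks, rounding to nearest and clamping at
 * one tick so the timer always advances.  A sketch (ms_to_ticks is
 * hypothetical):
 *
 *	static int
 *	ms_to_ticks(int hz, int ms)	// hz: kernel tick rate
 *	{
 *		int t = (hz * ms + 500) / 1000;	// round to nearest
 *
 *		return (t < 1 ? 1 : t);		// never zero
 *	}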
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information.
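 *
 * Below (and again in nfs_getattrcache()) the cached file size is
 * reconciled with the server's: while the node is marked NMODIFIED,
 * a stale, smaller server length must not shrink the local notion
 * of the file.  A minimal sketch of that rule (reconcile_size is
 * hypothetical):
 *
 *	static unsigned long
 *	reconcile_size(unsigned long srv, unsigned long local,
 *	    int locally_modified)
 *	{
 *		if (locally_modified && srv < local)
 *			return (local);	// keep our larger size
 *		return (srv);		// otherwise trust the server
 *	}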
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
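 *
 * For instance, trimming len == 7 with nul == 4 from a chain of
 * lengths {10, 10, 4}: the last mbuf is too short to absorb the trim,
 * so the correct chain length becomes 24 - 7 == 17; the scan below
 * stops at the second mbuf, shrinks it to 7 bytes, null-fills its
 * last 4 bytes, and the final loop zeroes the trailing mbuf's length.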
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
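 *
 * Resetting ndm_eocookie on the first directory map block is enough to
 * invalidate the map: nfs_getcookie() above never returns a slot at or
 * beyond ndm_eocookie unless it is adding new cookies.
 */

/*
 * Illustrative sketch, not part of this file: how nfs_getcookie() above
 * addresses a cookie slot.  A directory offset picks one nfsuint64 out
 * of a chain of nfsdmap blocks of NFSNUMCOOKIES slots each (offsets in
 * the first directory block get the shared null cookie instead).
 */
static void
nfs_cookie_addr_sketch(off_t off, int *blockp, int *slotp)
{
	int pos = off / NFS_DIRBLKSIZ;	/* directory block index */

	pos--;				/* block 0 uses the null cookie */
	*blockp = pos / NFSNUMCOOKIES;	/* which nfsdmap in the chain */
	*slotp = pos % NFSNUMCOOKIES;	/* index into ndm_cookies[] */
}
/*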
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfsclient/nfs_subs.c =================================================================== --- head/sys/nfsclient/nfs_subs.c (revision 13489) +++ head/sys/nfsclient/nfs_subs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
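 *
 * The table below is indexed by the Version 2 on-the-wire procedure
 * number and yields the generic NFSPROC_* number; nfsv2_procid further
 * down performs the reverse translation when a request is sent to a
 * Version 2 server.  For instance, v2 wire procedure 17
 * (NFSV2PROC_STATFS) maps here to NFSPROC_FSSTAT, and nfsv2_procid
 * maps NFSPROC_FSSTAT back to NFSV2PROC_STATFS.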
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
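 *
 * For example, if an NFSv3 RMDIR fails with ENOTEMPTY, that value is
 * in nfsv3err_rmdir below and is returned unchanged by nfsrv_errmap();
 * an errno that is not in the list (say EPERM) is replaced by the
 * first entry, the default NFSERR_IO.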
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
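 *
 * The eight XDR words laid down first are the fixed part of an
 * RFC 1057 call message:
 *
 *	xid, CALL, rpc version (2), program, version, procedure,
 *	cred flavor (auth_type), cred length (authsiz)
 *
 * followed by the credential body and the verifier built below.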
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
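 *
 * (nqnfsstarttime ends up nqsrv_maxlease + nqsrv_clockskew +
 * nqsrv_writeslack seconds past boot, so any lease granted by a
 * previous incarnation of this server should have expired before
 * full service resumes.)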
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. 
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfsserver/nfs_srvsubs.c =================================================================== --- head/sys/nfsserver/nfs_srvsubs.c (revision 13489) +++ head/sys/nfsserver/nfs_srvsubs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
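 *
 * The lookup convention is therefore: scan the sorted tail for an exact
 * match and fall back to the first (default) entry.  A sketch of that
 * scan -- it is what nfsrv_errmap() at the end of this file does for
 * Version 3:
 */
#ifdef notdef	/* illustrative sketch, not part of this revision */
static int
nfsv3_filter_err(short *errlist, int err)
{
	register short *ep;

	/* errlist[0] is the default; the rest are sorted and 0-terminated. */
	for (ep = errlist + 1; *ep != 0; ep++) {
		if (*ep == err)
			return (err);
		if (*ep > err)		/* sorted, so no later match possible */
			break;
	}
	return ((int)errlist[0]);
}
#endif
/*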
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
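 * This is the standard ONC RPC call preamble (cf. RFC 1057): xid,
 * message direction (CALL), RPC version 2, program, program version and
 * procedure number -- six words -- and the 8 * NFSX_UNSIGNED built
 * below also covers the two words of credential flavor and length that
 * come next.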
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
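 * The start time is pushed out past boot by the maximum lease term plus
 * the allowed clock skew and write slack, so that by the time this
 * incarnation issues leases, anything granted before the reboot has
 * provably expired -- the usual crash-recovery argument for lease-based
 * consistency.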
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. 
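 * In the block/character case, checkalias() may hand back an existing
 * vnode for the same device; the code below then migrates the nfsnode
 * to that alias (removing and re-inserting it in the nfsnode hash) and
 * discards the duplicate vnode with vrele()/vgone().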
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
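 * For example, trimming len = 30 from a chain of lengths 60+60+20
 * (count = 140): the last mbuf is too short to absorb the trim, so the
 * chain is rescanned for the corrected length 110, the second mbuf is
 * cut to 50 (zeroing nul pad bytes at its new tail when requested) and
 * the trailing mbuf is emptied.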
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
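 * (Note on the nfsrv_vmio() change below: vnode_pager_alloc() is now
 * handed an object size in pages rather than bytes.  round_page()
 * rounds va_size up to a PAGE_SIZE boundary and OFF_TO_IDX() shifts
 * right by PAGE_SHIFT, so with 4K pages a 5000-byte file becomes
 * round_page(5000) = 8192, i.e. an object of 2 pages.)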
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 13489) +++ head/sys/sys/bio.h (revision 13490) @@ -1,248 +1,249 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id: buf.h,v 1.25 1995/12/11 04:57:20 dyson Exp $ + * $Id: buf.h,v 1.26 1995/12/28 23:34:28 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) struct buf; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; typedef TAILQ_HEAD(buf_queue_head, buf) buf_queue_head, *buf_queue_head_t; /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. *depricated* XXX */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. 
*/ int b_validend; /* Offset of end of valid region. */ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[(MAXPHYS + PAGE_SIZE - 1)/PAGE_SIZE]; int b_npages; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((unsigned long)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ extern LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_VMIO 3 /* VMIO buffers */ #define QUEUE_AGE 4 /* not-useful buffers */ #define QUEUE_EMPTY 5 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. 
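 * (This revision also adds a declaration for bqrelse() below --
 * apparently a lighter-weight companion to brelse() that returns a
 * still-valid buffer straight to its free queue.)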
*/ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern TAILQ_HEAD(swqueue, buf) bswlist; __BEGIN_DECLS void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void brelse __P((struct buf *)); +void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((void)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages(struct buf *); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); void reassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((void)); void vm_bounce_alloc __P((struct buf *)); void vm_bounce_free __P((struct buf *)); vm_offset_t vm_bounce_kva_alloc __P((int)); void vm_bounce_kva_alloc_free __P((vm_offset_t, int)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 13489) +++ head/sys/sys/buf.h (revision 13490) @@ -1,248 +1,249 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id: buf.h,v 1.25 1995/12/11 04:57:20 dyson Exp $ + * $Id: buf.h,v 1.26 1995/12/28 23:34:28 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) struct buf; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; typedef TAILQ_HEAD(buf_queue_head, buf) buf_queue_head, *buf_queue_head_t; /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. *depricated* XXX */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. 
*/ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[(MAXPHYS + PAGE_SIZE - 1)/PAGE_SIZE]; int b_npages; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((unsigned long)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ extern LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_VMIO 3 /* VMIO buffers */ #define QUEUE_AGE 4 /* not-useful buffers */ #define QUEUE_EMPTY 5 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. 
*/ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern TAILQ_HEAD(swqueue, buf) bswlist; __BEGIN_DECLS void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void brelse __P((struct buf *)); +void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((void)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages(struct buf *); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); void reassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((void)); void vm_bounce_alloc __P((struct buf *)); void vm_bounce_free __P((struct buf *)); vm_offset_t vm_bounce_kva_alloc __P((int)); void vm_bounce_kva_alloc_free __P((vm_offset_t, int)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 13489) +++ head/sys/sys/vnode.h (revision 13490) @@ -1,431 +1,432 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 - * $Id: vnode.h,v 1.27 1995/12/17 21:23:44 phk Exp $ + * $Id: vnode.h,v 1.28 1995/12/25 07:24:13 bde Exp $ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. */ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_MSDOSFS, VT_DEVFS }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
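*/

/*
 * Editorial sketch (not part of the original header): the usual shape of
 * the per-filesystem private area mentioned above.  "struct myfsnode"
 * and the two macros are hypothetical; compare VTOI()/ITOV() in
 * ufs/ufs/inode.h, which follow the same pattern.
 */
#ifdef notdef
struct myfsnode {
	struct vnode *n_vnode;	/* back pointer to the owning vnode */
	/* ... filesystem-specific state ... */
};
#define	VTOMYFS(vp)	((struct myfsnode *)(vp)->v_data)
#define	MYFSTOV(np)	((np)->n_vnode)
#endif

/* Per-vnode buffer lists and the vnode structure itself follow.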
*/ LIST_HEAD(buflists, buf); typedef int vop_t __P((void *)); struct vnode { u_long v_flag; /* vnode flags (see below) */ short v_usecount; /* reference count of users */ short v_writecount; /* reference count of writers */ long v_holdcnt; /* page & buffer references */ daddr_t v_lastr; /* last read (read-ahead) */ u_long v_id; /* capability identifier */ struct mount *v_mount; /* ptr to vfs we are in */ vop_t **v_op; /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ long v_numoutput; /* num of writes in progress */ enum vtype v_type; /* vnode type */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; struct nqlease *v_lease; /* Soft reference to lease */ daddr_t v_lastw; /* last write (write cluster) */ daddr_t v_cstart; /* start block of cluster */ daddr_t v_lasta; /* last allocation */ int v_clen; /* length of current cluster */ int v_ralen; /* Read-ahead length */ + int v_usage; /* Vnode usage counter */ daddr_t v_maxra; /* last readahead block */ void *v_object; /* Place to store VM object */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_specinfo v_un.vu_specinfo #define v_fifoinfo v_un.vu_fifoinfo /* * Vnode flags. */ #define VROOT 0x0001 /* root of its file system */ #define VTEXT 0x0002 /* vnode is a pure text prototype */ #define VSYSTEM 0x0004 /* vnode being used by kernel */ #define VOLOCK 0x0008 /* vnode is locked waiting for an object */ #define VOWANT 0x0010 /* a process is waiting for VOLOCK */ #define VXLOCK 0x0100 /* vnode is locked to change underlying type */ #define VXWANT 0x0200 /* process is waiting for vnode */ #define VBWAIT 0x0400 /* waiting for output to complete */ #define VALIASED 0x0800 /* vnode has an alias */ #define VDIROP 0x1000 /* LFS: vnode is involved in a directory op */ #define VVMIO 0x2000 /* VMIO flag */ #define VNINACT 0x4000 /* LFS: skip ufs_inactive() in lfs_vunref */ #define VAGE 0x8000 /* Insert vnode at head of free list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* file's access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ long va_fsid; /* file system id (dev for now) */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags.
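*/

/*
 * Editorial sketch (not part of the original header): how setattr
 * callers use struct vattr.  VATTR_NULL() (defined below, KERNEL only)
 * fills every field with VNOVAL, so only fields assigned afterwards are
 * acted upon.  example_chmod() is a hypothetical name, and VOP_SETATTR
 * comes from vnode_if.h, which is included further below.
 */
#ifdef notdef
static int
example_chmod(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	struct vattr va;

	VATTR_NULL(&va);	/* every field becomes VNOVAL */
	va.va_mode = 0644;	/* change only the file mode */
	return (VOP_SETATTR(vp, &va, cred, p));
}
#endif

/* Flags for va_vaflags, continued.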
*/ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ /* * Flags for ioflag. */ #define IO_UNIT 0x01 /* do I/O as atomic unit */ #define IO_APPEND 0x02 /* append write to end */ #define IO_SYNC 0x04 /* do I/O synchronously */ #define IO_NODELOCKED 0x08 /* underlying node already locked */ #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VSUID 04000 /* set user id on execution */ #define VSGID 02000 /* set group id on execution */ #define VSVTX 01000 /* save swapped text even after use */ #define VREAD 00400 /* read, write, execute permissions */ #define VWRITE 00200 #define VEXEC 00100 /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #ifdef KERNEL /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writeable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_SAVEMETA 0x0002 /* vinvalbuf: leave indirect blocks */ #ifdef DIAGNOSTIC #define HOLDRELE(vp) holdrele(vp) #define VATTR_NULL(vap) vattr_null(vap) #define VHOLD(vp) vhold(vp) #define VREF(vp) vref(vp) void holdrele __P((struct vnode *)); void vhold __P((struct vnode *)); #else #define HOLDRELE(vp) (vp)->v_holdcnt-- /* decrease buf or page ref */ #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #define VHOLD(vp) (vp)->v_holdcnt++ /* increase buf or page ref */ #define VREF(vp) (vp)->v_usecount++ /* increase reference */ #endif #define NULLVP ((struct vnode *)NULL) #ifdef VFS_LKM #define VNODEOP_SET(f) DATA_SET(MODVNOPS,f) #else #define VNODEOP_SET(f) DATA_SET(vfs_opv_descs_,f) #endif /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_check) __P((struct vnode *vp, struct proc *p, struct ucred *ucred, int flag)); extern void (*lease_updatetime) __P((int deltat)); #ifdef NFS #ifdef NQNFS #define LEASE_CHECK(vp, p, cred, flag) lease_check((vp), (p), (cred), (flag)) #define LEASE_UPDATETIME(dt) lease_updatetime(dt) #else #define LEASE_CHECK(vp, p, cred, flag) #define LEASE_UPDATETIME(dt) #endif /* NQNFS */ #else #define LEASE_CHECK(vp, p, cred, flag) \ do { if(lease_check) lease_check((vp), (p), (cred), (flag)); } while(0) #define LEASE_UPDATETIME(dt) \ do { if(lease_updatetime) lease_updatetime(dt); } while(0) #endif /* NFS */ #endif /* KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments.
*/ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_proc_offset; /* proc location, if any */ int vdesc_componentname_offset; /* if any */ /* * Finally, we've got a list of private data (about each operation) * for each transport layer. (Support to manage this list is not * yet part of BSD.) */ caddr_t *vdesc_transports; }; #ifdef KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; /* * This macro is very helpful in defining those offsets in the vdesc struct. * * This is stolen from X11R4. I ignored all the fancy stuff for * Crays, so if you decide to port this to such a serious machine, * you might want to consult Intrinsic.h's XtOffset{,Of,To}. */ #define VOPARG_OFFSET(p_type,field) \ ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) #define VOPARG_OFFSETOF(s_type,field) \ VOPARG_OFFSET(s_type*,field) #define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) /* * This structure is used to configure the new vnodeops vector. */ struct vnodeopv_entry_desc { struct vnodeop_desc *opve_op; /* which operation this is */ vop_t *opve_impl; /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ vop_t ***opv_desc_vector_p; struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A default routine which just returns an error. */ int vn_default_error __P((void)); /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* * Finally, include the default set of vnode operations. */ #include <vnode_if.h> /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct proc; struct stat; struct ucred; struct uio; struct vattr; struct vnode; struct vop_bwrite_args; int bdevvp __P((dev_t dev, struct vnode **vpp)); /* cache_* may belong in namei.h.
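*/

/*
 * Editorial sketch (not part of the original header): roughly what a
 * generated wrapper in vnode_if.h expands to, using the VDESC/VOFFSET/
 * VCALL macros above.  The argument-structure layout follows the
 * vop_generic_args convention (a_desc first); names are illustrative.
 */
#ifdef notdef
static int
example_VOP_FSYNC(struct vnode *vp, struct ucred *cred, int waitfor,
    struct proc *p)
{
	struct vop_fsync_args a;

	a.a_desc = VDESC(vop_fsync);	/* describes this operation */
	a.a_vp = vp;
	a.a_cred = cred;
	a.a_waitfor = waitfor;
	a.a_p = p;
	/* Index this vnode's op vector by the op's offset and call it. */
	return (VCALL(vp, VOFFSET(vop_fsync), &a));
}
#endif

/* Name-cache entry points; as noted above, cache_* may belong in namei.h.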
*/ void cache_enter __P((struct vnode *dvp, struct vnode *vp, struct componentname *cnp)); int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)); void cache_purge __P((struct vnode *vp)); void cache_purgevfs __P((struct mount *mp)); struct vnode * checkalias __P((struct vnode *vp, dev_t nvp_rdev, struct mount *mp)); int getnewvnode __P((enum vtagtype tag, struct mount *mp, vop_t **vops, struct vnode **vpp)); void insmntque __P((struct vnode *vp, struct mount *mp)); void vattr_null __P((struct vattr *vap)); int vcount __P((struct vnode *vp)); int vfinddev __P((dev_t dev, enum vtype type, struct vnode **vpp)); void vfs_opv_init __P((struct vnodeopv_desc **them)); int vget __P((struct vnode *vp, int lockflag)); void vgone __P((struct vnode *vp)); void vgoneall __P((struct vnode *vp)); int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, struct proc *p, int slpflag, int slptimeo)); int vn_bwrite __P((struct vop_bwrite_args *ap)); int vn_close __P((struct vnode *vp, int flags, struct ucred *cred, struct proc *p)); int vn_open __P((struct nameidata *ndp, int fmode, int cmode)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); void vn_vmio_close __P((struct vnode *vp)); int vn_writechk __P((struct vnode *vp)); void vprint __P((char *label, struct vnode *vp)); void vput __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); #endif /* KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_balloc.c =================================================================== --- head/sys/ufs/ffs/ffs_balloc.c (revision 13489) +++ head/sys/ufs/ffs/ffs_balloc.c (revision 13490) @@ -1,291 +1,291 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 - * $Id: ffs_balloc.c,v 1.8 1995/05/28 04:32:23 davidg Exp $ + * $Id: ffs_balloc.c,v 1.9 1995/05/30 08:14:59 rgrimes Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. */ int ffs_balloc(ip, bn, size, cred, bpp, flags) register struct inode *ip; register daddr_t bn; int size; struct ucred *cred; struct buf **bpp; int flags; { register struct fs *fs; register daddr_t nb; struct buf *bp, *nbp; struct vnode *vp = ITOV(ip); struct indir indirs[NIADDR + 2]; daddr_t newb, lbn, *bap, pref; int osize, nsize, num, i, error; *bpp = NULL; if (bn < 0) return (EFBIG); fs = ip->i_fs; lbn = bn; /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ nb = lblkno(fs, ip->i_size); if (nb < NDADDR && nb < bn) { osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { error = ffs_realloccg(ip, nb, ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]), osize, (int)fs->fs_bsize, cred, &bp); if (error) return (error); ip->i_size = (nb + 1) * fs->fs_bsize; ip->i_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & B_SYNC) bwrite(bp); else bawrite(bp); } } /* * The first NDADDR blocks are direct blocks */ if (bn < NDADDR) { nb = ip->i_db[bn]; if (nb != 0 && ip->i_size >= (bn + 1) * fs->fs_bsize) { error = bread(vp, bn, fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, bn, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { error = ffs_realloccg(ip, bn, ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), osize, nsize, cred, &bp); if (error) return (error); } } else { if (ip->i_size < (bn + 1) * fs->fs_bsize) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; error = ffs_alloc(ip, bn, ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), nsize, cred, &newb); if (error) return (error); bp = getblk(vp, bn, nsize, 0, 0); bp->b_blkno = fsbtodb(fs, newb); if (flags & B_CLRBUF) vfs_bio_clrbuf(bp); } ip->i_db[bn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; error = ufs_getlbns(vp, bn, indirs, &num); if (error) return(error); #ifdef DIAGNOSTIC if (num < 1) panic ("ffs_balloc: ufs_bmaparray returned indirect block"); #endif /* * Fetch the first indirect block allocating if necessary. 
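* Editorial note (added): ufs_getlbns() filled indirs[] with the path of
 * indirect blocks leading to logical block bn; indirs[0].in_off names
 * the slot in ip->i_ib[], and indirs[1..num] give the blocks to visit at
 * each deeper level.  The code below walks that path, allocating missing
 * blocks as it goes; note that, as of this revision, an indirect block
 * that is only read (not modified) is dropped with the newly declared
 * bqrelse() instead of brelse(), returning it to its queue without
 * pushing it toward reuse.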
*/ --num; nb = ip->i_ib[indirs[0].in_off]; if (nb == 0) { pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) return (error); nb = newb; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0); bp->b_blkno = fsbtodb(fs, newb); vfs_bio_clrbuf(bp); /* * Write synchronously so that indirect blocks * never point at garbage. */ error = bwrite(bp); if (error) { ffs_blkfree(ip, nb, fs->fs_bsize); return (error); } ip->i_ib[indirs[0].in_off] = newb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bap = (daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { - brelse(bp); + bqrelse(bp); continue; } if (pref == 0) pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); /* * Write synchronously so that indirect blocks * never point at garbage. */ error = bwrite(nbp); if (error) { ffs_blkfree(ip, nb, fs->fs_bsize); brelse(bp); return (error); } bap[indirs[i - 1].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } } /* * Get the data block, allocating if necessary. */ if (nb == 0) { pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); if (flags & B_CLRBUF) vfs_bio_clrbuf(nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } *bpp = nbp; return (0); } brelse(bp); if (flags & B_CLRBUF) { error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); if (error) { brelse(nbp); return (error); } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); } *bpp = nbp; return (0); } Index: head/sys/ufs/ffs/ffs_inode.c =================================================================== --- head/sys/ufs/ffs/ffs_inode.c (revision 13489) +++ head/sys/ufs/ffs/ffs_inode.c (revision 13490) @@ -1,522 +1,522 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 - * $Id: ffs_inode.c,v 1.18 1995/12/11 04:57:37 dyson Exp $ + * $Id: ffs_inode.c,v 1.19 1996/01/05 18:31:48 wollman Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, long *)); int ffs_init() { return (ufs_init()); } /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. The IN_MODIFIED * flag is used to specify that the inode needs to be updated even if none * of the times needs to be updated. The access and modified times are taken * from the second and third parameters; the inode change time is always * taken from the current time. If waitfor is set, then wait for the disk * write of the inode to complete. */ int ffs_update(ap) struct vop_update_args /* { struct vnode *a_vp; struct timeval *a_access; struct timeval *a_modify; int a_waitfor; } */ *ap; { register struct fs *fs; struct buf *bp; struct inode *ip; int error; time_t tv_sec; ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); return (0); } if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) return (0); /* * Use a copy of the current time to get consistent timestamps * (a_access and a_modify are sometimes aliases for &time). * * XXX in 2.0, a_access and a_modify are often pointers to the * same copy of `time'. This is not as good. Some callers forget * to make a copy; others make a copy too early (before the i/o * has completed)... * * XXX there should be a function or macro for reading the time * (e.g., some machines may require splclock()). */ tv_sec = time.tv_sec; if (ip->i_flag & IN_ACCESS) ip->i_atime.ts_sec = (ap->a_access == &time ? tv_sec : ap->a_access->tv_sec); if (ip->i_flag & IN_UPDATE) { ip->i_mtime.ts_sec = (ap->a_modify == &time ? tv_sec : ap->a_modify->tv_sec); ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) ip->i_ctime.ts_sec = tv_sec; ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); fs = ip->i_fs; /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_din.di_ouid = ip->i_uid; /* XXX */ ip->i_din.di_ogid = ip->i_gid; /* XXX */ } /* XXX */ error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) return (bwrite(bp)); else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ int ffs_truncate(ap) struct vop_truncate_args /* { struct vnode *a_vp; off_t a_length; int a_flags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *ovp = ap->a_vp; register daddr_t lastblock; register struct inode *oip; daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; register struct fs *fs; struct buf *bp; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; struct timeval tv; register int i; int aflags, error, allerror; off_t osize; oip = VTOI(ovp); fs = oip->i_fs; if (length < 0 || length > fs->fs_maxfilesize) return (EINVAL); tv = time; if (ovp->v_type == VLNK && (oip->i_size < ovp->v_mount->mnt_maxsymlinklen || oip->i_din.di_blocks == 0)) { #ifdef DIAGNOSTIC if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 0)); } #ifdef QUOTA error = getinoquota(oip); if (error) return (error); #endif osize = oip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; + vnode_pager_setsize(ovp, length); error = ffs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, aflags); if (error) return (error); oip->i_size = length; if (aflags & B_SYNC) bwrite(bp); else if (ovp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else bawrite(bp); - vnode_pager_setsize(ovp, length); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zeroed in case it ever becomes accessible again because * of subsequent file growth.
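* Editorial worked example (added): with an 8192-byte block size,
 * truncating a file to length 5000 gives offset = blkoff(fs, 5000) =
 * 5000, so the code below re-allocates the final block and bzero()s
 * bytes 5000 through blksize - 1; if the file later grows back over
 * that range, readers see zeroes rather than the stale contents.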
*/ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; error = ffs_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (aflags & B_SYNC) bwrite(bp); else if (ovp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); for (level = TRIPLE; level >= SINGLE; level--) if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } for (i = NDADDR - 1; i > lastblock; i--) oip->i_db[i] = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; - error = VOP_UPDATE(ovp, &tv, &tv, 0); + error = VOP_UPDATE(ovp, &tv, &tv, ((length > 0) ? 0 : 1)); if (error) allerror = error; /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); oip->i_size = osize; vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ffs_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ffs_blkfree(oip, bn, fs->fs_bsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = NDADDR - 1; i > lastblock; i--) { register long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ffs_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. 
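* Editorial worked example (added): with 8192-byte blocks and 1024-byte
 * frags, shrinking the last block from oldspace = 8192 to newspace =
 * 2048 keeps numfrags(fs, 2048) = 2 frags, so the code below frees
 * oldspace - newspace = 6144 bytes starting at bn + 2, i.e. frags 2
 * through 7 of the old block.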
*/ bn += numfrags(fs, newspace); ffs_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef DIAGNOSTIC for (level = SINGLE; level <= TRIPLE; level++) if (newblks[NDADDR + level] != oip->i_ib[level]) panic("ffs_truncate1"); for (i = 0; i < NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("ffs_truncate2"); if (length == 0 && (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) panic("ffs_truncate3"); #endif /* DIAGNOSTIC */ /* * Put back the real size. */ oip->i_size = length; oip->i_blocks -= blocksreleased; if (oip->i_blocks < 0) /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); #ifdef QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif return (allerror); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) register struct inode *ip; daddr_t lbn, lastbn; daddr_t dbn; int level; long *countp; { register int i; struct buf *bp; register struct fs *fs = ip->i_fs; register daddr_t *bap; struct vnode *vp; daddr_t *copy, nb, nlbn, last; long blkcount, factor; int nblocks, blocksreleased = 0; int error = 0, allerror = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { curproc->p_stats->p_ru.ru_inblock++; /* pay for read */ bp->b_flags |= B_READ; if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); error = biowait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (daddr_t *)bp->b_data; MALLOC(copy, daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; if ((vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { error = bwrite(bp); } else { bawrite(bp); error = 0; } if (error) allerror = error; bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ip, nb, fs->fs_bsize); blocksreleased += nblocks; } /* * Recursively free last partial block. 
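* Editorial note (added): the loop above freed every entry past "last",
 * leaving i == last on exit.  bap[i] is the one indirect block that may
 * still map live data; "lastbn % factor" (factor being NINDIR(fs)
 * raised to the indirection level) converts lastbn into an index valid
 * one level down, and the recursive call prunes that surviving subtree.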
*/ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } FREE(copy, M_TEMP); *countp = blocksreleased; return (allerror); } Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 13489) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 13490) @@ -1,1047 +1,1047 @@ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 - * $Id: ffs_vfsops.c,v 1.33 1996/01/05 18:31:49 wollman Exp $ + * $Id: ffs_vfsops.c,v 1.34 1996/01/14 18:54:59 bde Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_sbupdate __P((struct ufsmount *, int)); static int ffs_reload __P((struct mount *,struct ucred *,struct proc *)); static int ffs_oldfscompat __P((struct fs *)); static int ffs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); struct vfsops ufs_vfsops = { ffs_mount, ufs_start, ffs_unmount, ufs_root, ufs_quotactl, ffs_statfs, ffs_sync, ffs_vget, ffs_fhtovp, ffs_vptofh, ffs_init, }; VFS_SET(ufs_vfsops, ufs, MOUNT_UFS, 0); extern u_long nextgennumber; /* * ffs_mount * * Called when mounting local physical media * * PARAMETERS: * mountroot * mp mount point structure * path NULL (flag for root mount!!!) 
* data * ndp * p process (user credentials check [statfs]) * * mount * mp mount point structure * path path to mount point * data pointer to argument struct in user space * ndp mount point namei() return (used for * credentials on reload), reused to look * up block device. * p process (user credentials check) * * RETURNS: 0 Success * !0 error number (errno.h) * * LOCK STATE: * * ENTRY * mount point is locked * EXIT * mount point is locked * * NOTES: * A NULL path can be used for a flag since the mount * system call will fail with EFAULT in copyinstr in * namei() if it is a genuine NULL from the user. */ static int ffs_mount( mp, path, data, ndp, p) register struct mount *mp; /* mount struct pointer*/ char *path; /* path to mount point*/ caddr_t data; /* arguments to FS specific mount*/ struct nameidata *ndp; /* mount point credentials*/ struct proc *p; /* process requesting mount*/ { u_int size; int err = 0; struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; register struct fs *fs; int flags; /* * Use NULL path to flag a root mount */ if( path == NULL) { /* *** * Mounting root file system *** */ /* Get vnode for root device*/ if( bdevvp( rootdev, &rootvp)) panic("ffs_mountroot: can't setup bdevvp for root"); /* * FS specific handling */ mp->mnt_flag |= MNT_RDONLY; /* XXX globally applicable?*/ /* * Attempt mount */ if( ( err = ffs_mountfs(rootvp, mp, p)) != 0) { /* fs specific cleanup (if any)*/ goto error_1; } goto dostatfs; /* success*/ } /* *** * Mounting non-root file system or updating a file system *** */ /* copy in user arguments*/ err = copyin(data, (caddr_t)&args, sizeof (struct ufs_args)); if (err) goto error_1; /* can't get arguments*/ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; err = 0; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp)) { err = EBUSY; goto error_1; } err = ffs_flushfiles(mp, flags, p); vfs_unbusy(mp); } if (!err && (mp->mnt_flag & MNT_RELOAD)) err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); if (err) { goto error_1; } if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) { if (!fs->fs_clean) { if (mp->mnt_flag & MNT_FORCE) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck.\n", fs->fs_fsmnt); err = EPERM; goto error_1; } } fs->fs_ronly = 0; } if (fs->fs_ronly == 0) { fs->fs_clean = 0; ffs_sbupdate(ump, MNT_WAIT); } /* if not updating name...*/ if (args.fspec == 0) { /* * Process export requests. Jumping to "success" * will return the vfs_export() error code. */ err = vfs_export(mp, &ump->um_export, &args.export); goto success; } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
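* Editorial note (added): NDINIT()/namei() below translate the
 * user-supplied args.fspec path into a vnode; the checks that follow
 * reject anything that is not a block device (VBLK) whose major number
 * is within nblkdev, since only such a vnode can back a UFS mount.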
*/ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); err = namei(ndp); if (err) { /* can't get devvp!*/ goto error_1; } devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { err = ENOTBLK; goto error_2; } if (major(devvp->v_rdev) >= nblkdev) { err = ENXIO; goto error_2; } if (mp->mnt_flag & MNT_UPDATE) { /* ******************** * UPDATE ******************** */ if (devvp != ump->um_devvp) err = EINVAL; /* needs translation */ else vrele(devvp); /* * Update device name only on success */ if( !err) { /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); } } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. */ /* Save "last mounted on" info for mount point (NULL pad)*/ copyinstr( path, /* mount point*/ mp->mnt_stat.f_mntonname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, /* device name*/ mp->mnt_stat.f_mntfromname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); err = ffs_mountfs(devvp, mp, p); } if (err) { goto error_2; } dostatfs: /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); goto success; error_2: /* error with devvp held*/ /* release devvp before failing*/ vrele(devvp); error_1: /* no state to back out*/ success: return( err); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(mp, cred, p) register struct mount *mp; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp, *devvp; struct inode *ip; struct csum *space; struct buf *bp; struct fs *fs; int i, blks, size, error; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; if (vinvalbuf(devvp, 0, cred, p, 0, 0)) panic("ffs_reload: dirty1"); /* * Step 2: re-read superblock from disk. */ error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp); if (error) return (error); fs = (struct fs *)bp->b_data; if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mp)->um_fs; bcopy(&fs->fs_csp[0], &((struct fs *)bp->b_data)->fs_csp[0], sizeof(fs->fs_csp)); bcopy(bp->b_data, fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); ffs_oldfscompat(fs); /* * Step 3: re-read summary information from disk. 
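* Editorial note (added): the cylinder-group summary area is fs_cssize
 * bytes long and is read back one full block (fs_frag frags) at a
 * time; the final transfer is trimmed to (blks - i) * fs_fsize so the
 * short tail is not over-read.  The same loop shape appears in
 * ffs_mountfs() and ffs_sbupdate() below.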
*/ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, fs->fs_csp[fragstoblks(fs, i)], (u_int)size); brelse(bp); } loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { nvp = vp->v_mntvnodes.le_next; /* * Step 4: invalidate all inactive vnodes. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * Step 5: invalidate all cached file data. */ if (vget(vp, 1)) goto loop; if (vinvalbuf(vp, 0, cred, p, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); brelse(bp); vput(vp); if (vp->v_mount != mp) goto loop; } return (0); } /* * Common code for mount and mountroot */ int ffs_mountfs(devvp, mp, p) register struct vnode *devvp; struct mount *mp; struct proc *p; { register struct ufsmount *ump; struct buf *bp; register struct fs *fs; dev_t dev = devvp->v_rdev; struct partinfo dpart; caddr_t base, space; int havepart = 0, blks; int error, i, size; int ronly; u_int strsize; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; else { havepart = 1; size = dpart.disklab->d_secsize; } bp = NULL; ump = NULL; error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp); if (error) goto out; fs = (struct fs *)bp->b_data; if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { error = EINVAL; /* XXX needs translation */ goto out; } if (!fs->fs_clean) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck.\n",fs->fs_fsmnt); error = EPERM; goto out; } } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); bp = NULL; fs = ump->um_fs; fs->fs_ronly = ronly; if (ronly == 0) { fs->fs_fmod = 1; fs->fs_clean = 0; } blks = howmany(fs->fs_cssize, fs->fs_fsize); base = space = malloc((u_long)fs->fs_cssize, M_UFSMNT, M_WAITOK); for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) { free(base, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; space += size; brelse(bp); bp = NULL; } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = MOUNT_UFS; mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; devvp->v_specflags |= SI_MOUNTEDON; ffs_oldfscompat(fs); /* * Set FS local "last mounted on" information (NULL pad) */ copystr( mp->mnt_stat.f_mntonname, /* mount point*/ fs->fs_fsmnt, /* copy area*/ sizeof(fs->fs_fsmnt) - 1, /* max size*/ &strsize); /* real size*/ bzero( fs->fs_fsmnt + strsize, sizeof(fs->fs_fsmnt) - strsize); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) ffs_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); if (ump) { free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * Sanity checks for old file systems. * * XXX - goes away some day. */ static int ffs_oldfscompat(fs) struct fs *fs; { fs->fs_npsect = max(fs->fs_npsect, fs->fs_nsect); /* XXX */ fs->fs_interleave = max(fs->fs_interleave, 1); /* XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ fs->fs_nrpos = 8; /* XXX */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ #if 0 int i; /* XXX */ quad_t sizepb = fs->fs_bsize; /* XXX */ fs->fs_maxfilesize = fs->fs_bsize * NDADDR - 1; /* XXX */ for (i = 0; i < NIADDR; i++) { /* XXX */ sizepb *= NINDIR(fs); /* XXX */ fs->fs_maxfilesize += sizepb; /* XXX */ } /* XXX */ #endif fs->fs_maxfilesize = (u_quad_t) 1LL << 39; fs->fs_qbmask = ~fs->fs_bmask; /* XXX */ fs->fs_qfmask = ~fs->fs_fmask; /* XXX */ } /* XXX */ return (0); } /* * unmount system call */ int ffs_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; int error, flags, ronly; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } error = ffs_flushfiles(mp, flags, p); if (error) return (error); ump = VFSTOUFS(mp); fs = ump->um_fs; ronly = fs->fs_ronly; if (!ronly) { fs->fs_clean = 1; ffs_sbupdate(ump, MNT_WAIT); } ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; error = VOP_CLOSE(ump->um_devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); /* vrele(ump->um_devvp); */ vn_vmio_close(ump->um_devvp); free(fs->fs_csp[0], M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, p) register struct mount *mp; int flags; struct proc *p; { register struct ufsmount *ump; int error; if (!doforce) flags &= ~FORCECLOSE; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, NULLVP, SKIPSYSTEM|flags); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(p, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif error = vflush(mp, NULLVP, flags); return (error); } /* * Get file system statistics. */ int ffs_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_MAGIC) panic("ffs_statfs"); sbp->f_type = MOUNT_UFS; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree; sbp->f_bavail = freespace(fs, fs->fs_minfree); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree; if (sbp != &mp->mnt_stat) { bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ int ffs_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp; register struct inode *ip; register struct ufsmount *ump = VFSTOUFS(mp); register struct fs *fs; struct timeval tv; int error, allerror = 0; fs = ump->um_fs; /* * Write back modified superblock. * Consistency check that the superblock * is still in the buffer cache. */ if (fs->fs_fmod != 0) { if (fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("update: rofs mod"); } fs->fs_fmod = 0; fs->fs_time = time.tv_sec; allerror = ffs_sbupdate(ump, waitfor); } /* * Write back each (modified) inode. */ loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp)) continue; ip = VTOI(vp); if ((((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0)) && vp->v_dirtyblkhd.lh_first == NULL) continue; if (vp->v_type != VCHR) { if (vget(vp, 1)) goto loop; error = VOP_FSYNC(vp, cred, waitfor, p); if (error) allerror = error; vput(vp); } else { tv = time; /* VOP_UPDATE(vp, &tv, &tv, waitfor == MNT_WAIT); */ VOP_UPDATE(vp, &tv, &tv, 0); } } /* * Force stale file system control information to be flushed. */ error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p); if (error) allerror = error; #ifdef QUOTA qsync(mp); #endif return (allerror); } /* * Look up a FFS dinode number to find its incore vnode, otherwise read it * in from disk. 
If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ffs_inode_hash_lock; int ffs_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { register struct fs *fs; register struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; dev_t dev; int type, error; ump = VFSTOUFS(mp); dev = ump->um_dev; restart: if ((*vpp = ufs_ihashget(dev, ino)) != NULL) return (0); /* * Lock out the creation of new entries in the FFS hash table in * case getnewvnode() or MALLOC() blocks, otherwise a duplicate * may occur! */ if (ffs_inode_hash_lock) { while (ffs_inode_hash_lock) { ffs_inode_hash_lock = -1; tsleep(&ffs_inode_hash_lock, PVM, "ffsvgt", 0); } goto restart; } ffs_inode_hash_lock = 1; /* Allocate a new vnode/inode. */ error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp); if (error) { if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; *vpp = NULL; return (error); } type = ump->um_devvp->v_tag == VT_MFS ? M_MFSNODE : M_FFSNODE; /* XXX */ MALLOC(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); bzero((caddr_t)ip, sizeof(struct inode)); vp->v_data = ip; ip->i_vnode = vp; ip->i_fs = fs = ump->um_fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ ufs_ihashins(ip); if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ - vput(vp); brelse(bp); + vput(vp); *vpp = NULL; return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); - brelse(bp); + bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, ffs_specop_p, FFS_FIFOOPS, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; VREF(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { if (++nextgennumber < (u_long)time.tv_sec) nextgennumber = time.tv_sec; ip->i_gen = nextgennumber; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din.di_ouid; /* XXX */ ip->i_gid = ip->i_din.di_ogid; /* XXX */ } /* XXX */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ int ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct mbuf *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { register struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int ffs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct inode *ip; register struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ffs_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { register struct fs *fs = mp->um_fs; register struct buf *bp; int blks; caddr_t space; int i, size, error = 0; bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); /* Restore compatibility to old file systems. XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ ((struct fs *)bp->b_data)->fs_nrpos = -1; /* XXX */ if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); blks = howmany(fs->fs_cssize, fs->fs_fsize); space = (caddr_t)fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0); bcopy(space, bp->b_data, (u_int)size); space += size; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } return (error); } Index: head/sys/ufs/ufs/ufs_bmap.c =================================================================== --- head/sys/ufs/ufs/ufs_bmap.c (revision 13489) +++ head/sys/ufs/ufs/ufs_bmap.c (revision 13490) @@ -1,317 +1,317 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 - * $Id: ufs_bmap.c,v 1.9 1995/09/04 00:21:09 dyson Exp $ + * $Id: ufs_bmap.c,v 1.10 1995/11/05 23:07:37 dyson Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Bmap converts the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned).
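 *
 * (Worked example, not from the original comment, assuming an 8K
 * filesystem block with 2048 four-byte pointers per indirect block
 * and NDADDR == 12: the single indirect block covering data blocks
 * 12..2059 lives at logical block -12; the double indirect block
 * covering data blocks 2060..4196363 lives at -2061, one less than
 * -2060, the address of the first single indirect block it points
 * to; the triple indirect block follows the same rule one level up.)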
*/ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; register daddr_t bn; daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; } if (runb) { *runb = 0; } xap = ap == NULL ? a : ap; if (!nump) nump = # error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) - brelse(bp); + bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparry: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], ((daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) - brelse(bp); + bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. 
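 *
 * (Worked example, illustrative only, with the same 2048-pointer,
 * NDADDR == 12 geometry as above: asking for data block 2065 -- the
 * block at slot 5 of the first single indirect block reached through
 * the double indirect path -- yields three entries:
 *
 *	{ in_lbn = -2061, in_off = 1 }	i_ib[1], the double indirect
 *	{ in_lbn = -2061, in_off = 0 }	slot 0 within that block
 *	{ in_lbn = -2060, in_off = 5 }	slot 5 of the single indirect
 *
 * after which the data block itself can be read.)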
*/ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; register daddr_t bn; struct indir *ap; int *nump; { long metalbn, realbn; struct ufsmount *ump; int blockcnt, i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); blockcnt *= MNINDIR(ump); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(ump); off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; return (0); } Index: head/sys/ufs/ufs/ufs_readwrite.c =================================================================== --- head/sys/ufs/ufs/ufs_readwrite.c (revision 13489) +++ head/sys/ufs/ufs/ufs_readwrite.c (revision 13490) @@ -1,435 +1,438 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 - * $Id: ufs_readwrite.c,v 1.18 1996/01/06 12:49:53 phk Exp $ + * $Id: ufs_readwrite.c,v 1.19 1996/01/07 09:42:36 phk Exp $ */ #ifdef LFS_READWRITE #define BLKSIZE(a, b, c) blksize(a) #define FS struct lfs #define I_FS i_lfs #define READ lfs_read #define READ_S "lfs_read" #define WRITE lfs_write #define WRITE_S "lfs_write" #define fs_bsize lfs_bsize #define fs_maxfilesize lfs_maxfilesize #else #define BLKSIZE(a, b, c) blksize(a, b, c) #define FS struct fs #define I_FS i_fs #define READ ffs_read #define READ_S "ffs_read" #define WRITE ffs_write #define WRITE_S "ffs_write" #include #include #include #endif /* * Vnode op for reading. */ /* ARGSUSED */ int READ(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct inode *ip; register struct uio *uio; register FS *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; u_short mode; vp = ap->a_vp; ip = VTOI(vp); mode = ip->i_mode; uio = ap->a_uio; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("%s: mode", READ_S); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", READ_S); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", READ_S, vp->v_type); #endif fs = ip->I_FS; if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) return (EFBIG); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; #ifdef LFS_READWRITE (void)lfs_check(vp, lbn); error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); #else if (lblktosize(fs, nextlbn) > ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if (doclusterread) error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == vp->v_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); #endif - if (error) + if (error) { + brelse(bp); + bp = NULL; break; + } vp->v_lastr = lbn; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. 
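 * (For instance: if a request for an 8192-byte block transfers only
 * the first 4096 bytes without B_ERROR being set, b_resid is 4096;
 * the clipping below shrinks size, and with it xfersize, so that
 * uiomove() never copies from the unread half of the buffer.)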
*/ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; if (error) break; - brelse(bp); + bqrelse(bp); } if (bp != NULL) - brelse(bp); + bqrelse(bp); ip->i_flag |= IN_ACCESS; return (error); } /* * Vnode op for writing. */ int WRITE(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct uio *uio; register struct inode *ip; register FS *fs; struct buf *bp; struct proc *p; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, xfersize; struct timeval tv; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("%s: mode", WRITE_S); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: if ((ioflag & IO_SYNC) == 0) panic("%s: nonsync dir write", WRITE_S); break; default: panic("%s: type", WRITE_S); } fs = ip->I_FS; if (uio->uio_offset < 0 || (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ p = uio->uio_procp; if (vp->v_type == VREG && p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); return (EFBIG); } resid = uio->uio_resid; osize = ip->i_size; flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); #ifdef LFS_READWRITE (void)lfs_check(vp, lbn); error = lfs_balloc(vp, xfersize, lbn, &bp); #else if (fs->fs_bsize > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; error = ffs_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); #endif if (error) break; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; } size = BLKSIZE(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; #ifdef LFS_READWRITE (void)VOP_BWRITE(bp); #else if (ioflag & IO_VMIO) bp->b_flags |= B_RELBUF; if (ioflag & IO_SYNC) { (void)bwrite(bp); - } else if (xfersize + blkoffset == fs->fs_bsize && - (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + } else if (xfersize + blkoffset == fs->fs_bsize) { if (doclusterwrite) { bp->b_flags |= B_CLUSTEROK; cluster_write(bp, ip->i_size); } else { bawrite(bp); } } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } #endif if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
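 * (Example, for illustration: a 04755 set-uid executable written to
 * by its non-root owner comes out of the mask below as plain 0755,
 * since ip->i_mode &= ~(ISUID | ISGID) strips exactly the two
 * privileged bits while leaving the permission bits alone.)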
*/ if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_mode &= ~(ISUID | ISGID); if (error) { if (ioflag & IO_UNIT) { (void)VOP_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { tv = time; error = VOP_UPDATE(vp, &tv, &tv, 1); } return (error); } #ifndef LFS_READWRITE /* * get page routine */ int ffs_getpages(ap) struct vop_getpages_args *ap; { off_t foff, physoffset; int i, size, bsize; struct vnode *dp; int bbackwards, bforwards; int pbackwards, pforwards; int firstpage; int reqlblkno; daddr_t reqblkno; int poff; int pcount; int rtval; int pagesperblock; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block * then, the entire page is valid -- */ if (ap->a_m[ap->a_reqpage]->valid) { ap->a_m[ap->a_reqpage]->valid = VM_PAGE_BITS_ALL; for (i = 0; i < pcount; i++) { if (i != ap->a_reqpage) vnode_pager_freepage(ap->a_m[i]); } return VM_PAGER_OK; } bsize = ap->a_vp->v_mount->mnt_stat.f_iosize; /* * foff is the file offset of the required page * reqlblkno is the logical block that contains the page * poff is the index of the page into the logical block */ foff = IDX_TO_OFF(ap->a_m[ap->a_reqpage]->pindex) + ap->a_offset; reqlblkno = foff / bsize; poff = (foff % bsize) / PAGE_SIZE; if ( VOP_BMAP( ap->a_vp, reqlblkno, &dp, &reqblkno, &bforwards, &bbackwards) || (reqblkno == -1)) { for(i = 0; i < pcount; i++) { if (i != ap->a_reqpage) vnode_pager_freepage(ap->a_m[i]); } if (reqblkno == -1) { if ((ap->a_m[ap->a_reqpage]->flags & PG_ZERO) == 0) vm_page_zero_fill(ap->a_m[ap->a_reqpage]); ap->a_m[ap->a_reqpage]->dirty = 0; ap->a_m[ap->a_reqpage]->valid = VM_PAGE_BITS_ALL; return VM_PAGER_OK; } else { return VM_PAGER_ERROR; } } physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; pagesperblock = bsize / PAGE_SIZE; /* * find the first page that is contiguous... * note that pbackwards is the number of pages that are contiguous * backwards. */ firstpage = 0; if (ap->a_count) { pbackwards = poff + bbackwards * pagesperblock; if (ap->a_reqpage > pbackwards) { firstpage = ap->a_reqpage - pbackwards; for(i=0;ia_m[i]); } /* * pforwards is the number of pages that are contiguous * after the current page. */ pforwards = (pagesperblock - (poff + 1)) + bforwards * pagesperblock; if (pforwards < (pcount - (ap->a_reqpage + 1))) { for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) vnode_pager_freepage(ap->a_m[i]); pcount = ap->a_reqpage + pforwards + 1; } /* * number of pages for I/O corrected for the non-contig pages at * the beginning of the array. */ pcount -= firstpage; } /* * calculate the size of the transfer */ size = pcount * PAGE_SIZE; if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > ((vm_object_t) ap->a_vp->v_object)->un_pager.vnp.vnp_size) size = ((vm_object_t) ap->a_vp->v_object)->un_pager.vnp.vnp_size - IDX_TO_OFF(ap->a_m[firstpage]->pindex); physoffset -= IDX_TO_OFF(ap->a_m[ap->a_reqpage]->pindex); rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, (ap->a_reqpage - firstpage), physoffset); return (rtval); } #endif Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 13489) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 13490) @@ -1,2151 +1,2152 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. 
* (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_vnops.c 8.10 (Berkeley) 4/1/94 - * $Id: ufs_vnops.c,v 1.35 1995/12/11 04:57:49 dyson Exp $ + * $Id: ufs_vnops.c,v 1.36 1996/01/05 18:31:58 wollman Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); static int ufs_chown __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); #ifdef EXT2FS #include #include #include #endif /* EXT2FS */ union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } /* * Create a regular file */ int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode so that it will be reloaded by VFS_VGET and * checked to see if it is an alias of an existing entry in * the inode cache. */ vput(*vpp); (*vpp)->v_type = VNON; vgone(*vpp); *vpp = 0; return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (0); } int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; register gid_t *gp; int i, error; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } /* If immutable bit set, nobody gets to write it. 
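 * (Note, added for clarity: this check precedes the uid 0 shortcut
 * below, so even the superuser collects EPERM on a VWRITE request
 * once the immutable flag is set on the file; only clearing the
 * flag -- itself restricted by securelevel in ufs_setattr() --
 * reopens the file for writing.)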
*/ if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) return (EPERM); /* Otherwise, user id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->i_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->i_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* ARGSUSED */ int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); register struct vattr *vap = ap->a_vap; ITIMES(ip, &time, &time); /* * Copy from inode table */ vap->va_fsid = ip->i_dev; vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = (dev_t)ip->i_rdev; vap->va_size = ip->i_din.di_size; vap->va_atime = ip->i_atime; vap->va_mtime = ip->i_mtime; vap->va_ctime = ip->i_ctime; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; /* this doesn't belong here */ if (vp->v_type == VBLK) vap->va_blocksize = BLKDEV_IOSIZE; else if (vp->v_type == VCHR) vap->va_blocksize = MAXBSIZE; else vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob(ip->i_blocks); vap->va_type = vp->v_type; vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; struct timeval atimeval, mtimeval; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser(cred, &p->p_acflag))) return (error); if (cred->cr_uid == 0) { if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && securelevel > 0) return (EPERM); ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. 
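 * (Illustration: a utimes(2) call arrives here with only va_atime
 * and va_mtime filled in; every other field still holds VNOVAL, so
 * only the timestamp branch below does any work.)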
*/ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p); if (error) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; } error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p); if (error) return (error); } ip = VTOI(vp); if (vap->va_atime.ts_sec != VNOVAL || vap->va_mtime.ts_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser(cred, &p->p_acflag)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.ts_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.ts_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; atimeval.tv_sec = vap->va_atime.ts_sec; atimeval.tv_usec = vap->va_atime.ts_nsec / 1000; mtimeval.tv_sec = vap->va_mtime.ts_sec; mtimeval.tv_usec = vap->va_mtime.ts_nsec / 1000; error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ufs_chmod(vp, (int)vap->va_mode, cred, p); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, p) register struct vnode *vp; register int mode; register struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); int error; if (cred->cr_uid != ip->i_uid) { error = suser(cred, &p->p_acflag); if (error) return (error); } if (cred->cr_uid) { if (vp->v_type != VDIR && (mode & S_ISTXT)) return (EFTYPE); if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) return (EPERM); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, p) register struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA register int i; long change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * If we don't own the file, are trying to change the owner * of the file, or are not a member of the target group, * the caller must be superuser or the call fails. 
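 * (Concretely: a non-superuser may chgrp a file they own into
 * another group they are a member of -- uid unchanged, target group
 * passes groupmember() -- but changing the owner uid always falls
 * through to the suser() check.)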
*/ if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || (gid != ip->i_gid && !groupmember((gid_t)gid, cred))) && (error = suser(cred, &p->p_acflag))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if (error = getinoquota(ip)) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = ip->i_blocks; (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; ip->i_uid = uid; #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("chown: lost quota"); #endif /* QUOTA */ if (ouid != uid || ogid != gid) ip->i_flag |= IN_CHANGE; if (ouid != uid && cred->cr_uid != 0) ip->i_mode &= ~ISUID; if (ogid != gid && cred->cr_uid != 0) ip->i_mode &= ~ISGID; return (0); } /* ARGSUSED */ int ufs_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (ENOTTY); } /* ARGSUSED */ int ufs_select(ap) struct vop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * We should really check to see if I/O is possible. */ return (1); } /* * Mmap a file * * NB Currently unsupported. */ /* ARGSUSED */ int ufs_mmap(ap) struct vop_mmap_args /* { struct vnode *a_vp; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (EINVAL); } /* * Seek on a file * * Nothing to do, so just return. 
*/ /* ARGSUSED */ int ufs_seek(ap) struct vop_seek_args /* { struct vnode *a_vp; off_t a_oldoff; off_t a_newoff; struct ucred *a_cred; } */ *ap; { return (0); } int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_dirremove(dvp, ap->a_cnp); } else { error = ufs_dirremove(dvp, ap->a_cnp); } #else error = ufs_dirremove(dvp, ap->a_cnp); #endif /* EXT2FS */ if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: if (dvp == vp) vrele(vp); else vput(vp); vput(dvp); return (error); } /* * link vnode call */ int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct timeval tv; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(tdvp, cnp); error = EXDEV; goto out2; } if (vp != tdvp && (error = VOP_LOCK(vp))) { VOP_ABORTOP(tdvp, cnp); goto out2; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { VOP_ABORTOP(tdvp, cnp); error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { VOP_ABORTOP(tdvp, cnp); error = EPERM; goto out1; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; tv = time; error = VOP_UPDATE(vp, &tv, &tv, 1); if (!error) { #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_direnter(ip, tdvp, cnp); } else { error = ufs_direnter(ip, tdvp, cnp); } #else error = ufs_direnter(ip, tdvp, cnp); #endif /* EXT2FS */ } if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } FREE(cnp->cn_pnbuf, M_NAMEI); out1: if (vp != tdvp) VOP_UNLOCK(vp); out2: vput(tdvp); return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensures the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory.
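 *
 * (Illustrative timeline: renaming "foo" over an existing "bar",
 * where foo's inode starts at i_nlink == 1: step 1 raises it to 2,
 * step 2 points the "bar" entry at it and drops the old target's
 * link, and step 3 removes the "foo" entry, returning i_nlink to 1.
 * A crash in between leaves the count too high, which fsck can
 * repair, never too low, which would free a live inode.)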
*/ int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; register struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate dirbuf; struct timeval tv; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ vrele(fdvp); vrele(fvp); return (error); } /* * Check if just deleting a link name. */ if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } if (fvp == tvp) { if (fvp->v_type == VDIR) { error = EINVAL; goto abortit; } /* Release destination completely. */ VOP_ABORTOP(tdvp, tcnp); vput(tdvp); vput(tvp); /* Delete source. */ vrele(fdvp); vrele(fvp); fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); fcnp->cn_nameiop = DELETE; (void) relookup(fdvp, &fvp, fcnp); return (VOP_REMOVE(fdvp, fvp, fcnp)); } error = VOP_LOCK(fvp); if (error) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if ((ip->i_flags & (IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_nlink++; ip->i_flag |= IN_CHANGE; tv = time; error = VOP_UPDATE(fvp, &tv, &tv, 1); if (error) { VOP_UNLOCK(fvp); goto bad; } /* * If ".." must be changed (i.e. the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath().
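 * (Concretely: "mv /a /a/b/c" must fail, since /a lies on the path
 * from the target back to the root; checkpath() walks the ".."
 * chain upward from the target directory and errors out if it
 * meets the source directory first.)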
*/ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_checkpath(ip, dp, tcnp->cn_cred); } else { error = ufs_checkpath(ip, dp, tcnp->cn_cred); } #else error = ufs_checkpath(ip, dp, tcnp->cn_cred); #endif /* EXT2FS */ if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = VOP_UPDATE(tdvp, &tv, &tv, 1); if (error) goto bad; } #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_direnter(ip, tdvp, tcnp); } else { error = ufs_direnter(ip, tdvp, tcnp); } #else error = ufs_direnter(ip, tdvp, tcnp); #endif /* EXT2FS */ if (error) { if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; (void)VOP_UPDATE(tdvp, &tv, &tv, 1); } goto bad; } vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { #ifdef EXT2FS if (! (IS_EXT2_VNODE(ITOV(xp)) ? ext2_dirempty : ufs_dirempty) #else if (! ufs_dirempty #endif /* EXT2FS */ (xp, dp->i_number, tcnp->cn_cred) || xp->i_nlink > 2) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } #ifdef EXT2FS if (IS_EXT2_VNODE(ITOV(dp))) { error = ext2_dirrewrite(dp, ip, tcnp); } else { error = ufs_dirrewrite(dp, ip, tcnp); } #else error = ufs_dirrewrite(dp, ip, tcnp); #endif /* EXT2FS */ if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. 
We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ xp->i_nlink--; if (doingdirectory) { if (--xp->i_nlink != 0) panic("rename: linked directory"); error = VOP_TRUNCATE(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_proc); } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); (void) relookup(fdvp, &fvp, fcnp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, tcnp->cn_cred, (int *)0, (struct proc *)0); if (error == 0) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (fvp->v_mount->mnt_maxsymlinklen <= 0) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; # else namlen = dirbuf.dotdot_namlen; # endif #ifdef EXT2FS if(IS_EXT2_VNODE(fvp)) namlen = ((struct odirtemplate *) &dirbuf)->dotdot_namlen; #endif /* EXT2FS */ if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { ufs_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf.dotdot_ino = newparent; (void) vn_rdwr(UIO_WRITE, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_SYNC, tcnp->cn_cred, (int *)0, (struct proc *)0); cache_purge(fdvp); } } } #ifdef EXT2FS if (IS_EXT2_VNODE(fdvp)) { error = ext2_dirremove(fdvp, fcnp); } else { error = ufs_dirremove(fdvp, fcnp); } #else error = ufs_dirremove(fdvp, fcnp); #endif /* EXT2FS */ if (!error) { xp->i_nlink--; xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (VOP_LOCK(fvp) == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } /* * A virgin directory (no blushing please). 
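 * (Layout note, for illustration: the template below holds a "."
 * entry with a 12-byte record length followed by a ".." entry whose
 * record length is DIRBLKSIZ - 12, so the two records exactly fill
 * one directory block; with the default DIRBLKSIZ of 512 the ".."
 * record claims the remaining 500 bytes.)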
*/ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, { '.', 0 }, 0, DIRBLKSIZ - 12, DT_DIR, 2, { '.', '.', 0 } }; static struct odirtemplate omastertemplate = { 0, 12, 1, { '.', 0 }, 0, DIRBLKSIZ - 12, 2, { '.', '.', 0 } }; /* * Mkdir system call */ int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; struct timeval tv; int error, dmode; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = VOP_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_uid = cnp->cn_cred->cr_uid; ip->i_gid = dp->i_gid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { free(cnp->cn_pnbuf, M_NAMEI); VOP_VFREE(tvp, ip->i_number, dmode); vput(tvp); vput(dvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; tv = time; error = VOP_UPDATE(tvp, &tv, &tv, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = VOP_UPDATE(dvp, &tv, &tv, 1); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0 #ifdef EXT2FS /* omastertemplate is what we want for EXT2 */ && !IS_EXT2_VNODE(dvp) #endif /* EXT2FS */ ) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; #ifdef EXT2FS /* note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE * so let's just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ (IS_EXT2_VNODE(dvp) ? \ VTOI(dvp)->i_e2fs->s_blocksize : DEV_BSIZE) if(IS_EXT2_VNODE(dvp)) dirtemplate.dotdot_reclen = DIRBLKSIZ - 12; #endif /* EXT2FS */ error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate, sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0); if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */ else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } /* Directory set up, now install its entry in the parent directory. */ #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_direnter(ip, dvp, cnp); } else { error = ufs_direnter(ip, dvp, cnp); } #else error = ufs_direnter(ip, dvp, cnp); #endif /* EXT2FS */ if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0.
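 * (That is: the error path below zeroes i_nlink before vput(); when
 * the last reference drops, the inactive routine sees the zero link
 * count and reclaims the inode along with any blocks it had been
 * given, exactly as if it had been truncated and freed by hand.)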
*/ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: FREE(cnp->cn_pnbuf, M_NAMEI); vput(dvp); return (error); #ifdef EXT2FS #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE #endif /* EXT2FS */ } /* * Rmdir system call. */ int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * No rmdir "." please. */ if (dp == ip) { vrele(dvp); vput(vp); return (EINVAL); } /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (ip->i_nlink != 2 || #ifdef EXT2FS !(IS_EXT2_VNODE(ITOV(ip)) ? ext2_dirempty : ufs_dirempty) (ip, dp->i_number, cnp->cn_cred)) { #else !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { #endif /* EXT2FS */ error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_dirremove(dvp, cnp); } else { error = ufs_dirremove(dvp, cnp); } #else error = ufs_dirremove(dvp, cnp); #endif /* EXT2FS */ if (error) goto out; dp->i_nlink--; dp->i_flag |= IN_CHANGE; cache_purge(dvp); vput(dvp); dvp = NULL; /* * Truncate inode. The only stuff left * in the directory is "." and "..". The * "." reference is inconsequential since * we're quashing it. The ".." reference * has already been adjusted above. We've * removed the "." reference and the reference * in the parent directory, but there may be * other hard links so decrement by 2 and * worry about them later. */ ip->i_nlink -= 2; error = VOP_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred, cnp->cn_proc); cache_purge(ITOV(ip)); out: if (dvp) vput(dvp); vput(vp); return (error); } /* * symlink -- make a symbolic link */ int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; register struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, (struct proc *)0); vput(vp); return (error); } /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct uio *uio = ap->a_uio; off_t off; int count, lost, error; if (ap->a_ncookies != NULL) /* * Ensure that the block is aligned. The caller can use * the cookies to determine where in the block to start. 
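 * (Example: with DIRBLKSIZ == 512 an offset of 1234 is masked down
 * to 1024 by the statement below -- 1234 & ~(512 - 1) == 1024 --
 * and the caller then consults the returned cookies to skip the
 * 210 bytes of entries that precede the original offset within
 * that block.)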
*/ uio->uio_offset &= ~(DIRBLKSIZ - 1); off = uio->uio_offset; count = uio->uio_resid; count &= ~(DIRBLKSIZ - 1); lost = uio->uio_resid - count; if (count < DIRBLKSIZ || (uio->uio_offset & (DIRBLKSIZ -1))) return (EINVAL); uio->uio_resid = count; uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); } else { struct dirent *dp, *edp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; dp->d_namlen = dp->d_type; dp->d_type = tmp; if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, uio); } FREE(dirbuf, M_TEMP); } # else error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dpEnd; struct dirent* dp; int ncookies; u_int *cookies; u_int *cookiep; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ufs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) (uio->uio_iov->iov_base - (uio->uio_offset - off)); dpEnd = (struct dirent *) uio->uio_iov->iov_base; for (dp = dpStart, ncookies = 0; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) ncookies++; MALLOC(cookies, u_int *, ncookies * sizeof(u_int), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; uio->uio_resid += lost; return (error); } /* * Return target name of a symbolic link */ int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || (ip->i_din.di_blocks == 0)) { /* XXX - for old fastlink support */ uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a CREATE, delete it. */ /* ARGSUSED */ int ufs_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); return (0); } /* * Lock an inode. If its already locked, set the WANT bit and sleep. 
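* * In outline the sleep lock below is (a condensed sketch; the real code also handles recursive holders via IN_RECURSE and records the holder's pid): acquire with * * while (ip->i_flag & IN_LOCKED) { ip->i_flag |= IN_WANTED; (void) tsleep((caddr_t)ip, PINOD, "ufslk2", 0); } * ip->i_flag |= IN_LOCKED; * * and release with * * ip->i_flag &= ~IN_LOCKED; * if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); }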
*/ int ufs_lock(ap) struct vop_lock_args /* { struct vnode *a_vp; } */ *ap; { struct proc *p = curproc; register struct vnode *vp = ap->a_vp; register struct inode *ip; start: while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t)vp, PINOD, "ufslk1", 0); } if (vp->v_tag == VT_NON) return (ENOENT); ip = VTOI(vp); if (ip->i_flag & IN_LOCKED) { if (p->p_pid == ip->i_lockholder) { if( (ip->i_flag & IN_RECURSE) == 0) panic("ufs_lock: recursive lock not expected, pid: %d\n", ip->i_lockholder); } else { ip->i_flag |= IN_WANTED; #ifdef DIAGNOSTIC if (p) ip->i_lockwaiter = p->p_pid; else ip->i_lockwaiter = -1; #endif (void) tsleep((caddr_t)ip, PINOD, "ufslk2", 0); goto start; } } #ifdef DIAGNOSTIC ip->i_lockwaiter = 0; if (((ip->i_flag & IN_RECURSE) == 0) && (ip->i_lockholder != 0)) panic("lockholder (%d) != 0", ip->i_lockholder); if (p && p->p_pid == 0) printf("locking by process 0\n"); #endif if ((ip->i_flag & IN_RECURSE) == 0) ip->i_lockcount = 1; else ++ip->i_lockcount; if (p) ip->i_lockholder = p->p_pid; else ip->i_lockholder = -1; ip->i_flag |= IN_LOCKED; return (0); } /* * Unlock an inode. If WANT bit is on, wakeup. */ int lockcount = 90; int ufs_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); #ifdef DIAGNOSTIC struct proc *p = curproc; if ((ip->i_flag & IN_LOCKED) == 0) { vprint("ufs_unlock: unlocked inode", ap->a_vp); panic("ufs_unlock NOT LOCKED"); } if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 && ip->i_lockholder > -1 && lockcount++ < 100) panic("unlocker (%d) != lock holder (%d)", p->p_pid, ip->i_lockholder); #endif if (--ip->i_lockcount > 0) { if ((ip->i_flag & IN_RECURSE) == 0) panic("ufs_unlock: recursive lock prematurely released, pid=%d\n", ip->i_lockholder); return (0); } ip->i_lockholder = 0; ip->i_flag &= ~(IN_LOCKED|IN_RECURSE); if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); } return (0); } /* * Check for a locked inode. */ int ufs_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { if (VTOI(ap->a_vp)->i_flag & IN_LOCKED) return (1); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ufs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct inode *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); return (0); } /* * Print out the contents of an inode. */ int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); printf("tag VT_UFS, ino %ld, on dev %d, %d", ip->i_number, major(ip->i_dev), minor(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("%s\n", (ip->i_flag & IN_LOCKED) ? 
" (LOCKED)" : ""); if (ip->i_lockholder == 0) return (0); printf("\towner pid %lu", (u_long)ip->i_lockholder); if (ip->i_lockwaiter) printf(" waiting pid %lu", (u_long)ip->i_lockwaiter); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ int ufsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifo's */ int ufsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifo's. */ int ufsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifo's. * * Update the times on the inode then do device close. */ int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. 
*/ int ufs_vinit(mntp, specops, fifoops, vpp) struct mount *mntp; vop_t **specops; vop_t **fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp, *nvp; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; nvp = checkalias(vp, ip->i_rdev, mntp); if (nvp) { /* * Discard unneeded vnode, but save its inode. */ ufs_ihashrem(ip); VOP_UNLOCK(vp); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased inode. */ vp = nvp; ip->i_vnode = vp; ufs_ihashins(ip); } break; case VFIFO: vp->v_op = fifoops; break; default: break; } if (ip->i_number == ROOTINO) vp->v_flag |= VROOT; /* * Initialize modrev times */ SETHIGH(ip->i_modrev, mono_time.tv_sec); SETLOW(ip->i_modrev, mono_time.tv_usec * 4294); *vpp = vp; return (0); } /* * Allocate a new inode. */ int ufs_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct inode *ip, *pdir; struct timeval tv; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = VOP_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) { free(cnp->cn_pnbuf, M_NAMEI); vput(dvp); return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; if ((mode & IFMT) == IFLNK) ip->i_uid = pdir->i_uid; else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { free(cnp->cn_pnbuf, M_NAMEI); VOP_VFREE(tvp, ip->i_number, mode); vput(tvp); vput(dvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && suser(cnp->cn_cred, NULL)) ip->i_mode &= ~ISGID; /* * Make sure inode goes to disk before directory entry. */ tv = time; error = VOP_UPDATE(tvp, &tv, &tv, 1); if (error) goto bad; #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_direnter(ip, dvp, cnp); } else { error = ufs_direnter(ip, dvp, cnp); } #else error = ufs_direnter(ip, dvp, cnp); #endif /* EXT2FS */ if (error) goto bad; + if ((cnp->cn_flags & SAVESTART) == 0) FREE(cnp->cn_pnbuf, M_NAMEI); vput(dvp); *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ free(cnp->cn_pnbuf, M_NAMEI); vput(dvp); ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } Index: head/sys/vm/default_pager.c =================================================================== --- head/sys/vm/default_pager.c (revision 13489) +++ head/sys/vm/default_pager.c (revision 13490) @@ -1,145 +1,145 @@ /* * Copyright (c) 1995, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: default_pager.c,v 1.4 1995/12/11 04:57:56 dyson Exp $ + * $Id: default_pager.c,v 1.5 1995/12/14 09:54:46 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include static vm_object_t default_pager_alloc __P((void *, vm_size_t, vm_prot_t, vm_ooffset_t)); static void default_pager_dealloc __P((vm_object_t)); static int default_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int default_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t default_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); /* * pagerops for OBJT_DEFAULT - "default pager". */ struct pagerops defaultpagerops = { NULL, default_pager_alloc, default_pager_dealloc, default_pager_getpages, default_pager_putpages, default_pager_haspage, NULL }; /* * no_pager_alloc just returns an initialized object. */ static vm_object_t default_pager_alloc(handle, size, prot, offset) void *handle; register vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { if (handle != NULL) panic("default_pager_alloc: handle specified"); - return vm_object_allocate(OBJT_DEFAULT, offset + size); + return vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset) + size); } static void default_pager_dealloc(object) vm_object_t object; { /* * OBJT_DEFAULT objects have no special resources allocated to them. */ } /* * The default pager has no backing store, so we always return * failure. */ static int default_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { return VM_PAGER_FAIL; } static int default_pager_putpages(object, m, c, sync, rtvals) vm_object_t object; vm_page_t *m; int c; boolean_t sync; int *rtvals; { int i; /* * Try to convert the object type into a OBJT_SWAP. * If the swp structure allocation fails, convert it * back to OBJT_DEFAULT and return failure. Otherwise * pass this putpages to the swap pager. 
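* * This is the lazy-allocation trick that keeps OBJT_DEFAULT objects cheap: anonymous memory pays no swap bookkeeping cost until the pageout daemon first writes one of its pages, and only then is the object converted in place, roughly (a sketch of the step taken just below): * * object->type = OBJT_SWAP; * if (swap_pager_swp_alloc(object, M_KERNEL) != 0) * object->type = OBJT_DEFAULT; (back out and fail)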
*/ object->type = OBJT_SWAP; if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { object->type = OBJT_DEFAULT; for (i = 0; i < c; i++) rtvals[i] = VM_PAGER_FAIL; return VM_PAGER_FAIL; } return swap_pager_putpages(object, m, c, sync, rtvals); } static boolean_t default_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { return FALSE; } Index: head/sys/vm/device_pager.c =================================================================== --- head/sys/vm/device_pager.c (revision 13489) +++ head/sys/vm/device_pager.c (revision 13490) @@ -1,297 +1,297 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 - * $Id: device_pager.c,v 1.18 1995/12/13 15:13:54 julian Exp $ + * $Id: device_pager.c,v 1.19 1995/12/14 09:54:49 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void dev_pager_init __P((void)); static vm_object_t dev_pager_alloc __P((void *, vm_size_t, vm_prot_t, vm_ooffset_t)); static void dev_pager_dealloc __P((vm_object_t)); static int dev_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int dev_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t dev_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; /* list of available vm_page_t's */ static TAILQ_HEAD(, vm_page) dev_pager_fakelist; static vm_page_t dev_pager_getfake __P((vm_offset_t)); static void dev_pager_putfake __P((vm_page_t)); static int dev_pager_alloc_lock, dev_pager_alloc_lock_want; struct pagerops devicepagerops = { dev_pager_init, dev_pager_alloc, dev_pager_dealloc, dev_pager_getpages, dev_pager_putpages, dev_pager_haspage, NULL }; static void dev_pager_init() { TAILQ_INIT(&dev_pager_object_list); TAILQ_INIT(&dev_pager_fakelist); } static vm_object_t dev_pager_alloc(handle, size, prot, foff) void *handle; vm_size_t size; vm_prot_t prot; vm_ooffset_t foff; { dev_t dev; d_mmap_t *mapfunc; vm_object_t object; unsigned int npages, off; /* * Make sure this device can be mapped. */ dev = (dev_t) (u_long) handle; mapfunc = cdevsw[major(dev)]->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) { printf("obsolete map function %p\n", (void *)mapfunc); return (NULL); } /* * Offset should be page aligned. */ if (foff & (PAGE_SIZE - 1)) return (NULL); /* * Check that the specified range of the device allows the desired * protection. * * XXX assumes VM_PROT_* == PROT_* */ - npages = atop(round_page(size)); + npages = size; for (off = foff; npages--; off += PAGE_SIZE) if ((*mapfunc) (dev, off, (int) prot) == -1) return (NULL); /* * Lock to prevent object creation race condition. */ while (dev_pager_alloc_lock) { dev_pager_alloc_lock_want++; tsleep(&dev_pager_alloc_lock, PVM, "dvpall", 0); dev_pager_alloc_lock_want--; } dev_pager_alloc_lock = 1; /* * Look up pager, creating as necessary. */ object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. */ object = vm_object_allocate(OBJT_DEVICE, - OFF_TO_IDX(foff + size)); + OFF_TO_IDX(foff) + size); object->handle = handle; TAILQ_INIT(&object->un_pager.devp.devp_pglist); TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); } else { /* * Gain a reference to the object. */ vm_object_reference(object); - if (OFF_TO_IDX(foff + size) > object->size) - object->size = OFF_TO_IDX(foff + size); + if (OFF_TO_IDX(foff) + size > object->size) + object->size = OFF_TO_IDX(foff) + size; } dev_pager_alloc_lock = 0; if (dev_pager_alloc_lock_want) wakeup(&dev_pager_alloc_lock); return (object); } static void dev_pager_dealloc(object) vm_object_t object; { vm_page_t m; TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list); /* * Free up our fake pages.
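* * (Clarifying note: these are the PG_FICTITIOUS pages manufactured by dev_pager_getfake() below; they name device physical addresses rather than managed RAM, so teardown simply returns them to dev_pager_fakelist instead of to the page queues.)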
*/ while ((m = object->un_pager.devp.devp_pglist.tqh_first) != 0) { TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq); dev_pager_putfake(m); } } static int dev_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_offset_t offset; vm_offset_t paddr; vm_page_t page; dev_t dev; int i, s; d_mmap_t *mapfunc; int prot; dev = (dev_t) (u_long) object->handle; offset = m[reqpage]->pindex + OFF_TO_IDX(object->paging_offset); prot = PROT_READ; /* XXX should pass in? */ mapfunc = cdevsw[major(dev)]->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) panic("dev_pager_getpage: no map function"); paddr = pmap_phys_address((*mapfunc) ((dev_t) dev, (int) offset << PAGE_SHIFT, prot)); #ifdef DIAGNOSTIC if (paddr == -1) panic("dev_pager_getpage: map function returns error"); #endif /* * Replace the passed in reqpage page with our own fake page and free up * all of the original pages. */ page = dev_pager_getfake(paddr); TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, page, pageq); for (i = 0; i < count; i++) { PAGE_WAKEUP(m[i]); vm_page_free(m[i]); } s = splhigh(); vm_page_insert(page, object, offset); splx(s); return (VM_PAGER_OK); } static int dev_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { panic("dev_pager_putpage called"); } static boolean_t dev_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } static vm_page_t dev_pager_getfake(paddr) vm_offset_t paddr; { vm_page_t m; int i; if (dev_pager_fakelist.tqh_first == NULL) { m = (vm_page_t) malloc(PAGE_SIZE * 2, M_VMPGDATA, M_WAITOK); for (i = (PAGE_SIZE * 2) / sizeof(*m); i > 0; i--) { TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); m++; } } m = dev_pager_fakelist.tqh_first; TAILQ_REMOVE(&dev_pager_fakelist, m, pageq); m->flags = PG_BUSY | PG_FICTITIOUS; m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; m->busy = 0; - m->bmapped = 0; + m->queue = PQ_NONE; m->wire_count = 1; m->phys_addr = paddr; return (m); } static void dev_pager_putfake(m) vm_page_t m; { if (!(m->flags & PG_FICTITIOUS)) panic("dev_pager_putfake: bad page"); TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); } Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 13489) +++ head/sys/vm/swap_pager.c (revision 13490) @@ -1,1620 +1,1630 @@ /* * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id: swap_pager.c,v 1.57 1995/12/14 09:54:52 phk Exp $ + * $Id: swap_pager.c,v 1.58 1995/12/17 07:19:55 bde Exp $ */ /* * Quick hack to page to dedicated partition(s). * TODO: * Add multiprocessor locks * Deal with async writes in a better fashion */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NPENDINGIO #define NPENDINGIO 10 #endif static int nswiodone; int swap_pager_full; extern int vm_swap_size; static int no_swap_space = 1; struct rlist *swaplist; #define MAX_PAGEOUT_CLUSTER 16 TAILQ_HEAD(swpclean, swpagerclean); typedef struct swpagerclean *swp_clean_t; static struct swpagerclean { TAILQ_ENTRY(swpagerclean) spc_list; int spc_flags; struct buf *spc_bp; vm_object_t spc_object; vm_offset_t spc_kva; int spc_count; vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; } swcleanlist[NPENDINGIO]; /* spc_flags values */ #define SPC_ERROR 0x01 #define SWB_EMPTY (-1) /* list of completed page cleans */ static struct swpclean swap_pager_done; /* list of pending page cleans */ static struct swpclean swap_pager_inuse; /* list of free pager clean structs */ static struct swpclean swap_pager_free; /* list of "named" anon region objects */ static struct pagerlst swap_pager_object_list; /* list of "unnamed" anon region objects */ struct pagerlst swap_pager_un_object_list; #define SWAP_FREE_NEEDED 0x1 /* need a swap block */ #define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2 static int swap_pager_needflags; static struct pagerlst *swp_qs[] = { &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0 }; /* * pagerops for OBJT_SWAP - "swap pager". 
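* * A pagerops vector plays the same role for pager types that a vnodeop vector plays for filesystems: a caller holding only the vm_object_t dispatches through the table entry for the object's type, along the lines of this sketch (table and field names as in the vm_pager code of this era; treat them as illustrative): * * rv = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);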
*/ static vm_object_t swap_pager_alloc __P((void *handle, vm_size_t size, vm_prot_t prot, vm_ooffset_t offset)); static void swap_pager_dealloc __P((vm_object_t object)); static boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after)); static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static void swap_pager_init __P((void)); static void swap_pager_sync __P((void)); struct pagerops swappagerops = { swap_pager_init, swap_pager_alloc, swap_pager_dealloc, swap_pager_getpages, swap_pager_putpages, swap_pager_haspage, swap_pager_sync }; static int npendingio = NPENDINGIO; static int dmmin; int dmmax; static __pure int swap_pager_block_index __P((vm_pindex_t pindex)) __pure2; static __pure int swap_pager_block_offset __P((vm_pindex_t pindex)) __pure2; static daddr_t *swap_pager_diskaddr __P((vm_object_t object, vm_pindex_t pindex, int *valid)); static void swap_pager_finish __P((swp_clean_t spc)); static void swap_pager_freepage __P((vm_page_t m)); static void swap_pager_free_swap __P((vm_object_t object)); static void swap_pager_freeswapspace __P((vm_object_t object, unsigned int from, unsigned int to)); static int swap_pager_getswapspace __P((vm_object_t object, unsigned int amount, daddr_t *rtval)); static void swap_pager_iodone __P((struct buf *)); static void swap_pager_iodone1 __P((struct buf *bp)); static void swap_pager_reclaim __P((void)); static void swap_pager_ridpages __P((vm_page_t *m, int count, int reqpage)); static void swap_pager_setvalid __P((vm_object_t object, vm_offset_t offset, int valid)); static void swapsizecheck __P((void)); static inline void swapsizecheck() { if (vm_swap_size < 128 * btodb(PAGE_SIZE)) { if (swap_pager_full == 0) printf("swap_pager: out of space\n"); swap_pager_full = 1; } else if (vm_swap_size > 192 * btodb(PAGE_SIZE)) swap_pager_full = 0; } static void swap_pager_init() { TAILQ_INIT(&swap_pager_object_list); TAILQ_INIT(&swap_pager_un_object_list); /* * Initialize clean lists */ TAILQ_INIT(&swap_pager_inuse); TAILQ_INIT(&swap_pager_done); TAILQ_INIT(&swap_pager_free); /* * Calculate the swap allocation constants. 
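* * Worked numbers, assuming the usual i386 values (PAGE_SIZE 4096, DEV_BSIZE 512, CLBYTES 4096) and an SWB_NPAGES of 8: dmmin = 4096 / 512 = 8 disk blocks, i.e. one page, and dmmax = btodb(8 * 4096) * 2 = 128 blocks, so swap allocations are kept within 64KB regions.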
*/ dmmin = CLBYTES / DEV_BSIZE; dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2; } void swap_pager_swap_init() { swp_clean_t spc; struct buf *bp; int i; /* * kva's are allocated here so that we don't need to keep doing * kmem_alloc pageables at runtime */ for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) { spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * MAX_PAGEOUT_CLUSTER); if (!spc->spc_kva) { break; } spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL); if (!spc->spc_bp) { kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); break; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); } } int swap_pager_swp_alloc(object, wait) vm_object_t object; int wait; { sw_blk_t swb; int nblocks; int i, j; nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES; swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait); if (swb == NULL) return 1; for (i = 0; i < nblocks; i++) { swb[i].swb_valid = 0; swb[i].swb_locked = 0; for (j = 0; j < SWB_NPAGES; j++) swb[i].swb_block[j] = SWB_EMPTY; } object->un_pager.swp.swp_nblocks = nblocks; object->un_pager.swp.swp_allocsize = 0; object->un_pager.swp.swp_blocks = swb; object->un_pager.swp.swp_poip = 0; if (object->handle != NULL) { TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list); } else { TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); } return 0; } /* * Allocate an object and associated resources. * Note that if we are called from the pageout daemon (handle == NULL) * we should not wait for memory as it could result in deadlock. */ static vm_object_t swap_pager_alloc(handle, size, prot, offset) void *handle; register vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { vm_object_t object; /* * If this is a "named" anonymous region, look it up and use the * object if it exists, otherwise allocate a new one. */ if (handle) { object = vm_pager_object_lookup(&swap_pager_object_list, handle); if (object != NULL) { vm_object_reference(object); } else { /* * XXX - there is a race condition here. Two processes * can request the same named object simultaneously, * and if one blocks for memory, the result is a disaster. * Probably quite rare, but is yet another reason to just * rip support of "named anonymous regions" out altogether.
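* * (Note on the size computation fixed below: as of this revision the pager alloc interfaces take "size" already in pages while "offset" is still in bytes, so only the offset belongs inside OFF_TO_IDX(). With 4096-byte pages, an offset of 8192 and a size of 5 pages, the old OFF_TO_IDX(offset + PAGE_SIZE - 1 + size) works out to 3 pages, while the corrected OFF_TO_IDX(offset + PAGE_SIZE - 1) + size gives the intended 7.)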
*/ object = vm_object_allocate(OBJT_SWAP, - OFF_TO_IDX(offset+ PAGE_SIZE - 1 + size)); + OFF_TO_IDX(offset + PAGE_SIZE - 1) + size); object->handle = handle; (void) swap_pager_swp_alloc(object, M_WAITOK); } } else { object = vm_object_allocate(OBJT_SWAP, - OFF_TO_IDX(offset + PAGE_SIZE - 1 + size)); + OFF_TO_IDX(offset + PAGE_SIZE - 1) + size); (void) swap_pager_swp_alloc(object, M_WAITOK); } return (object); } /* * returns disk block associated with pager and offset * additionally, as a side effect returns a flag indicating * if the block has been written */ inline static daddr_t * swap_pager_diskaddr(object, pindex, valid) vm_object_t object; vm_pindex_t pindex; int *valid; { register sw_blk_t swb; int ix; if (valid) *valid = 0; ix = pindex / SWB_NPAGES; if ((ix >= object->un_pager.swp.swp_nblocks) || (pindex >= object->size)) { return (FALSE); } swb = &object->un_pager.swp.swp_blocks[ix]; ix = pindex % SWB_NPAGES; if (valid) *valid = swb->swb_valid & (1 << ix); return &swb->swb_block[ix]; } /* * Utility routine to set the valid (written) bit for * a block associated with a pager and offset */ static void swap_pager_setvalid(object, offset, valid) vm_object_t object; vm_offset_t offset; int valid; { register sw_blk_t swb; int ix; ix = offset / SWB_NPAGES; if (ix >= object->un_pager.swp.swp_nblocks) return; swb = &object->un_pager.swp.swp_blocks[ix]; ix = offset % SWB_NPAGES; if (valid) swb->swb_valid |= (1 << ix); else swb->swb_valid &= ~(1 << ix); return; } /* * this routine allocates swap space with a fragmentation * minimization policy. */ static int swap_pager_getswapspace(object, amount, rtval) vm_object_t object; unsigned int amount; daddr_t *rtval; { unsigned location; vm_swap_size -= amount; if (!rlist_alloc(&swaplist, amount, &location)) { vm_swap_size += amount; return 0; } else { swapsizecheck(); object->un_pager.swp.swp_allocsize += amount; *rtval = location; return 1; } } /* * this routine frees swap space with a fragmentation * minimization policy. 
*/ static void swap_pager_freeswapspace(object, from, to) vm_object_t object; unsigned int from; unsigned int to; { rlist_free(&swaplist, from, to); vm_swap_size += (to - from) + 1; object->un_pager.swp.swp_allocsize -= (to - from) + 1; swapsizecheck(); } /* * this routine frees swap blocks from a specified pager */ void swap_pager_freespace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { vm_pindex_t i; int s; s = splbio(); for (i = start; i < start + size; i += 1) { int valid; daddr_t *addr = swap_pager_diskaddr(object, i, &valid); if (addr && *addr != SWB_EMPTY) { swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1); if (valid) { swap_pager_setvalid(object, i, 0); } *addr = SWB_EMPTY; } } splx(s); } static void swap_pager_free_swap(object) vm_object_t object; { register int i, j; register sw_blk_t swb; int first_block=0, block_count=0; int s; /* * Free left over swap blocks */ s = splbio(); for (i = 0, swb = object->un_pager.swp.swp_blocks; i < object->un_pager.swp.swp_nblocks; i++, swb++) { for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY) { /* * initially the length of the run is zero */ if (block_count == 0) { first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * if the new block can be included into the current run */ } else if (swb->swb_block[j] == first_block + block_count) { block_count += btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * terminate the previous run, and start a new one */ } else { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; } } } } if (block_count) { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); } splx(s); } /* * swap_pager_reclaim frees up over-allocated space from all pagers * this eliminates internal fragmentation due to allocation of space * for segments that are never swapped to. It has been written so that * it does not block until the rlist_free operation occurs; it keeps * the queues consistent.
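* * Only one process runs the scan at a time, serialized by the static in_reclaim flag and a tsleep/wakeup handshake, and each pass gathers at most MAXRECLAIM allocated-but-never-written blocks before freeing them in one batch at splbio.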
*/ /* * Maximum number of blocks (pages) to reclaim per pass */ #define MAXRECLAIM 128 static void swap_pager_reclaim() { vm_object_t object; int i, j, k; int s; int reclaimcount; static struct { int address; vm_object_t object; } reclaims[MAXRECLAIM]; static int in_reclaim; /* * allow only one process to be in the swap_pager_reclaim subroutine */ s = splbio(); if (in_reclaim) { tsleep(&in_reclaim, PSWP, "swrclm", 0); splx(s); return; } in_reclaim = 1; reclaimcount = 0; /* for each pager queue */ for (k = 0; swp_qs[k]; k++) { object = swp_qs[k]->tqh_first; while (object && (reclaimcount < MAXRECLAIM)) { /* * see if any blocks associated with a pager have been * allocated but not used (written) */ if (object->paging_in_progress == 0) { for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) { sw_blk_t swb = &object->un_pager.swp.swp_blocks[i]; if (swb->swb_locked) continue; for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY && (swb->swb_valid & (1 << j)) == 0) { reclaims[reclaimcount].address = swb->swb_block[j]; reclaims[reclaimcount++].object = object; swb->swb_block[j] = SWB_EMPTY; if (reclaimcount >= MAXRECLAIM) goto rfinished; } } } } object = object->pager_object_list.tqe_next; } } rfinished: /* * free the blocks that have been added to the reclaim list */ for (i = 0; i < reclaimcount; i++) { swap_pager_freeswapspace(reclaims[i].object, reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1); } splx(s); in_reclaim = 0; wakeup(&in_reclaim); } /* * swap_pager_copy copies blocks from one pager to another and * destroys the source pager */ void swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, offset) vm_object_t srcobject; vm_pindex_t srcoffset; vm_object_t dstobject; vm_pindex_t dstoffset; vm_pindex_t offset; { vm_pindex_t i; int origsize; int s; if (vm_swap_size) no_swap_space = 0; origsize = srcobject->un_pager.swp.swp_allocsize; /* * remove the source object from the swap_pager internal queue */ if (srcobject->handle == NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list); } s = splbio(); while (srcobject->un_pager.swp.swp_poip) { tsleep(srcobject, PVM, "spgout", 0); } splx(s); /* * clean all of the pages that are currently active and finished */ swap_pager_sync(); s = splbio(); /* * transfer source to destination */ for (i = 0; i < dstobject->size; i += 1) { int srcvalid, dstvalid; daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, i + offset + srcoffset, &srcvalid); daddr_t *dstaddrp; /* * see if the source has space allocated */ if (srcaddrp && *srcaddrp != SWB_EMPTY) { /* * if the source is valid and the dest has no space, * then copy the allocation from the source to the * dest. */ if (srcvalid) { dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset, &dstvalid); /* * if the dest already has a valid block, * deallocate the source block without * copying. */ if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { swap_pager_freeswapspace(dstobject, *dstaddrp, *dstaddrp + btodb(PAGE_SIZE) - 1); *dstaddrp = SWB_EMPTY; } if (dstaddrp && *dstaddrp == SWB_EMPTY) { *dstaddrp = *srcaddrp; *srcaddrp = SWB_EMPTY; dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE); srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE); swap_pager_setvalid(dstobject, i + dstoffset, 1); } } /* * if the source is not empty at this point, then * deallocate the space.
*/ if (*srcaddrp != SWB_EMPTY) { swap_pager_freeswapspace(srcobject, *srcaddrp, *srcaddrp + btodb(PAGE_SIZE) - 1); *srcaddrp = SWB_EMPTY; } } } splx(s); /* * Free left over swap blocks */ swap_pager_free_swap(srcobject); if (srcobject->un_pager.swp.swp_allocsize) { printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n", srcobject->un_pager.swp.swp_allocsize, origsize); } free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA); srcobject->un_pager.swp.swp_blocks = NULL; return; } static void swap_pager_dealloc(object) vm_object_t object; { int s; /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle == NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list); } /* * Wait for all pageouts to finish and remove all entries from * cleaning list. */ s = splbio(); while (object->un_pager.swp.swp_poip) { tsleep(object, PVM, "swpout", 0); } splx(s); swap_pager_sync(); /* * Free left over swap blocks */ swap_pager_free_swap(object); if (object->un_pager.swp.swp_allocsize) { printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n", object->un_pager.swp.swp_allocsize); } /* * Free swap management resources */ free(object->un_pager.swp.swp_blocks, M_VMPGDATA); object->un_pager.swp.swp_blocks = NULL; } static inline __pure int swap_pager_block_index(pindex) vm_pindex_t pindex; { return (pindex / SWB_NPAGES); } static inline __pure int swap_pager_block_offset(pindex) vm_pindex_t pindex; { return (pindex % SWB_NPAGES); } /* * swap_pager_haspage returns TRUE if the pager has data that has * been written out. */ static boolean_t swap_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { register sw_blk_t swb; int ix; if (before != NULL) *before = 0; if (after != NULL) *after = 0; ix = pindex / SWB_NPAGES; if (ix >= object->un_pager.swp.swp_nblocks) { return (FALSE); } swb = &object->un_pager.swp.swp_blocks[ix]; ix = pindex % SWB_NPAGES; if (swb->swb_block[ix] != SWB_EMPTY) { if (swb->swb_valid & (1 << ix)) { int tix; if (before) { for(tix = ix - 1; tix >= 0; --tix) { if ((swb->swb_valid & (1 << tix)) == 0) break; if ((swb->swb_block[tix] + (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) != swb->swb_block[ix]) break; (*before)++; } } if (after) { for(tix = ix + 1; tix < SWB_NPAGES; tix++) { if ((swb->swb_valid & (1 << tix)) == 0) break; if ((swb->swb_block[tix] - (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) != swb->swb_block[ix]) break; (*after)++; } } return TRUE; } } return (FALSE); } /* * swap_pager_freepage is a convenience routine that clears the busy * bit and deallocates a page. */ static void swap_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * swap_pager_ridpages is a convenience routine that deallocates all * but the required page. This is usually used in error returns that * need to invalidate the "extra" readahead pages.
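* * A worked example of the read clustering this supports in swap_pager_getpages() below (illustrative block numbers): with btodb(PAGE_SIZE) equal to 8, a required page at swap block 640 keeps its neighbours only if page i-1 sits at block 632, page i+1 at block 648, and all of them land in the same dmmax region; any page that breaks the run is released again via swap_pager_freepage().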
*/ static void swap_pager_ridpages(m, count, reqpage) vm_page_t *m; int count; int reqpage; { int i; for (i = 0; i < count; i++) if (i != reqpage) swap_pager_freepage(m[i]); } /* * swap_pager_iodone1 is the completion routine for both reads and async writes */ static void swap_pager_iodone1(bp) struct buf *bp; { bp->b_flags |= B_DONE; bp->b_flags &= ~B_ASYNC; wakeup(bp); } static int swap_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count, reqpage; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i; boolean_t rv; vm_offset_t kva, off[count]; swp_clean_t spc; vm_pindex_t paging_offset; int reqaddr[count]; int sequential; int first, last; int failed; int reqdskregion; object = m[reqpage]->object; paging_offset = OFF_TO_IDX(object->paging_offset); sequential = (m[reqpage]->pindex == (object->last_read + 1)); for (i = 0; i < count; i++) { vm_pindex_t fidx = m[i]->pindex + paging_offset; int ix = swap_pager_block_index(fidx); if (ix >= object->un_pager.swp.swp_nblocks) { int j; if (i <= reqpage) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } for (j = i; j < count; j++) { swap_pager_freepage(m[j]); } count = i; break; } swb[i] = &object->un_pager.swp.swp_blocks[ix]; off[i] = swap_pager_block_offset(fidx); reqaddr[i] = swb[i]->swb_block[off[i]]; } /* make sure that our required input request exists */ if (reqaddr[reqpage] == SWB_EMPTY || (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } reqdskregion = reqaddr[reqpage] / dmmax; /* * search backwards for the first contiguous page to transfer */ failed = 0; first = 0; for (i = reqpage - 1; i >= 0; --i) { if (sequential || failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (first == 0) first = i + 1; } } /* * search forwards for the last contiguous page to transfer */ failed = 0; last = count; for (i = reqpage + 1; i < count; i++) { if (failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (last == count) last = i; } } count = last; if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; reqaddr[i - first] = reqaddr[i]; off[i - first] = off[i]; } count -= first; reqpage -= first; } ++swb[reqpage]->swb_locked; /* * at this point: "m" is a pointer to the array of vm_page_t for * paging I/O "count" is the number of vm_page_t entries represented * by "m" "object" is the vm_object_t for I/O "reqpage" is the index * into "m" for the page actually faulted */ spc = NULL; /* we might not use an spc data structure */ if ((count == 1) && (swap_pager_free.tqh_first != NULL)) { spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); kva = spc->spc_kva; bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; } else { /* * Get a swap buffer header to perform the IO */ bp = getpbuf(); kva = (vm_offset_t) bp->b_data; } /* * map our page(s) into kva for input */ pmap_qenter(kva, m, count); bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING; bp->b_iodone = swap_pager_iodone1; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred =
bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr[0]; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; pbgetvp(swapdev_vp, bp); cnt.v_swapin++; cnt.v_swappgsin += count; /* * perform the I/O */ VOP_STRATEGY(bp); /* * wait for the sync I/O to complete */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "swread", 0); } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pagein failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } /* * relpbuf does this, but we maintain our own buffer list also... */ if (bp->b_vp) pbrelvp(bp); splx(s); swb[reqpage]->swb_locked--; /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); if (spc) { m[reqpage]->object->last_read = m[reqpage]->pindex; if (bp->b_flags & B_WANTED) wakeup(bp); /* * if we have used an spc, we need to free it. */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); } else { /* * release the physical I/O buffer */ relpbuf(bp); /* * finish up input if everything is ok */ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page * activated is up in the air, but we * should put the page on a page queue * somewhere. (it already is in the * object). After some empirical * results, it is best to deactivate * the readahead pages. */ vm_page_deactivate(m[i]); /* * just in case someone was asking for * this page we now tell them that it * is ok to use */ m[i]->valid = VM_PAGE_BITS_ALL; PAGE_WAKEUP(m[i]); } } m[reqpage]->object->last_read = m[count-1]->pindex; /* * If we're out of swap space, then attempt to free * some whenever pages are brought in. We must clear * the clean flag so that the page contents will be * preserved.
*/ if (swap_pager_full) { for (i = 0; i < count; i++) { m[i]->dirty = VM_PAGE_BITS_ALL; } swap_pager_freespace(object, m[0]->pindex + paging_offset, count); } } else { swap_pager_ridpages(m, count, reqpage); } } if (rv == VM_PAGER_OK) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[reqpage])); m[reqpage]->valid = VM_PAGE_BITS_ALL; m[reqpage]->dirty = 0; } return (rv); } int swap_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i, j, ix; boolean_t rv; vm_offset_t kva, off, fidx; swp_clean_t spc; vm_pindex_t paging_pindex; int reqaddr[count]; int failed; if (vm_swap_size) no_swap_space = 0; if (no_swap_space) { for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_FAIL; return VM_PAGER_FAIL; } spc = NULL; object = m[0]->object; paging_pindex = OFF_TO_IDX(object->paging_offset); failed = 0; for (j = 0; j < count; j++) { fidx = m[j]->pindex + paging_pindex; ix = swap_pager_block_index(fidx); swb[j] = 0; if (ix >= object->un_pager.swp.swp_nblocks) { rtvals[j] = VM_PAGER_FAIL; failed = 1; continue; } else { rtvals[j] = VM_PAGER_OK; } swb[j] = &object->un_pager.swp.swp_blocks[ix]; swb[j]->swb_locked++; if (failed) { rtvals[j] = VM_PAGER_FAIL; continue; } off = swap_pager_block_offset(fidx); reqaddr[j] = swb[j]->swb_block[off]; if (reqaddr[j] == SWB_EMPTY) { daddr_t blk; int tries; int ntoget; tries = 0; s = splbio(); /* * if any other pages have been allocated in this * block, we only try to get one page. */ for (i = 0; i < SWB_NPAGES; i++) { if (swb[j]->swb_block[i] != SWB_EMPTY) break; } ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1; /* * this code is a little conservative, but works (the * intent of this code is to allocate small chunks for * small objects) */ if ((off == 0) && ((fidx + ntoget) > object->size)) { ntoget = object->size - fidx; } retrygetspace: if (!swap_pager_full && ntoget > 1 && swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE), &blk)) { for (i = 0; i < ntoget; i++) { swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; swb[j]->swb_valid = 0; } reqaddr[j] = swb[j]->swb_block[off]; } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE), &swb[j]->swb_block[off])) { /* * if the allocation has failed, we try to * reclaim space and retry. */ if (++tries == 1) { swap_pager_reclaim(); goto retrygetspace; } rtvals[j] = VM_PAGER_AGAIN; failed = 1; swap_pager_full = 1; } else { reqaddr[j] = swb[j]->swb_block[off]; swb[j]->swb_valid &= ~(1 << off); } splx(s); } } /* * search forwards for the last contiguous page to transfer */ failed = 0; for (i = 0; i < count; i++) { if (failed || (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) || ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) || (rtvals[i] != VM_PAGER_OK)) { failed = 1; if (rtvals[i] == VM_PAGER_OK) rtvals[i] = VM_PAGER_AGAIN; } } for (i = 0; i < count; i++) { if (rtvals[i] != VM_PAGER_OK) { if (swb[i]) --swb[i]->swb_locked; } } for (i = 0; i < count; i++) if (rtvals[i] != VM_PAGER_OK) break; if (i == 0) { return VM_PAGER_AGAIN; } count = i; for (i = 0; i < count; i++) { if (reqaddr[i] == SWB_EMPTY) { printf("I/O to empty block???? -- pindex: %d, i: %d\n", m[i]->pindex, i); } } /* * For synchronous writes, we clean up all completed async pageouts.
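* * Note that the logic below insists on at least three entries on swap_pager_free before taking one (the tqh_first/tqe_next/tqe_next chain test); the tail of the list is held back as a reserve so the pageout daemon can always make progress, and other processes sleep on &swap_pager_free until swap_pager_iodone() replenishes it.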
*/ if (sync == TRUE) { swap_pager_sync(); } kva = 0; /* * get a swap pager clean data structure, block until we get it */ if (swap_pager_free.tqh_first == NULL || swap_pager_free.tqh_first->spc_list.tqe_next == NULL || swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { s = splbio(); if (curproc == pageproc) { +retryfree: /* * pageout daemon needs a swap control block */ swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT|SWAP_FREE_NEEDED; /* * if it does not get one within a short time, then * there is a potential deadlock, so we go-on trying - * to free pages. + * to free pages. It is important to block here as opposed + * to returning, thereby allowing the pageout daemon to continue. + * It is likely that pageout daemon will start suboptimally + * reclaiming vnode backed pages if we don't block. Since the + * I/O subsystem is probably already fully utilized, might as + * well wait. */ - tsleep(&swap_pager_free, PVM, "swpfre", hz/10); - swap_pager_sync(); - if (swap_pager_free.tqh_first == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { - splx(s); - return VM_PAGER_AGAIN; + if (tsleep(&swap_pager_free, PVM, "swpfre", hz/5)) { + swap_pager_sync(); + if (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + splx(s); + return VM_PAGER_AGAIN; + } + } else { + /* + * we make sure that pageouts aren't taking up all of + * the free swap control blocks. + */ + swap_pager_sync(); + if (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + goto retryfree; + } } - } else + } else { pagedaemon_wakeup(); - while (swap_pager_free.tqh_first == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { - if (curproc == pageproc) { - swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT; - if((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_reserved) - wakeup(&cnt.v_free_count); - } - - swap_pager_needflags |= SWAP_FREE_NEEDED; - tsleep(&swap_pager_free, PVM, "swpfre", 0); - if (curproc == pageproc) - swap_pager_sync(); - else + while (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep(&swap_pager_free, PVM, "swpfre", 0); pagedaemon_wakeup(); + } } splx(s); } spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); kva = spc->spc_kva; /* * map our page(s) into kva for I/O */ pmap_qenter(kva, m, count); /* * get the base I/O offset into the swap file */ for (i = 0; i < count; i++) { fidx = m[i]->pindex + paging_pindex; off = swap_pager_block_offset(fidx); /* * set the valid bit */ swb[i]->swb_valid |= (1 << off); /* * and unlock the data structure */ swb[i]->swb_locked--; } /* * Get a swap buffer header and perform the IO */ bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; bp->b_flags = B_BUSY | B_PAGING; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_data = (caddr_t) 
kva; bp->b_blkno = reqaddr[0]; pbgetvp(swapdev_vp, bp); bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; swapdev_vp->v_numoutput++; /* * If this is an async write we set up additional buffer fields and * place a "cleaning" entry on the inuse queue. */ s = splbio(); if (sync == FALSE) { spc->spc_flags = 0; spc->spc_object = object; for (i = 0; i < count; i++) spc->spc_m[i] = m[i]; spc->spc_count = count; /* * the completion routine for async writes */ bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; object->un_pager.swp.swp_poip++; TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); } else { object->un_pager.swp.swp_poip++; bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone1; } cnt.v_swapout++; cnt.v_swappgsout += count; /* * perform the I/O */ VOP_STRATEGY(bp); if (sync == FALSE) { if ((bp->b_flags & B_DONE) == B_DONE) { swap_pager_sync(); } splx(s); for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_PEND; } return VM_PAGER_PEND; } /* * wait for the sync I/O to complete */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "swwrt", 0); } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pageout failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } object->un_pager.swp.swp_poip--; if (object->un_pager.swp.swp_poip == 0) wakeup(object); if (bp->b_vp) pbrelvp(bp); if (bp->b_flags & B_WANTED) wakeup(bp); splx(s); /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); /* * if we have written the page, then indicate that the page is clean. */ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { if (rtvals[i] == VM_PAGER_OK) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; /* * optimization, if a page has been read * during the pageout process, we activate it. */ - if ((m[i]->flags & PG_ACTIVE) == 0 && + if ((m[i]->queue != PQ_ACTIVE) && ((m[i]->flags & (PG_WANTED|PG_REFERENCED)) || pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))) { vm_page_activate(m[i]); } } } } else { for (i = 0; i < count; i++) { rtvals[i] = rv; } } if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); return (rv); } static void swap_pager_sync() { register swp_clean_t spc, tspc; register int s; tspc = NULL; if (swap_pager_done.tqh_first == NULL) return; for (;;) { s = splbio(); /* * Look up and removal from done list must be done at splbio() * to avoid conflicts with swap_pager_iodone. */ while ((spc = swap_pager_done.tqh_first) != 0) { pmap_qremove(spc->spc_kva, spc->spc_count); swap_pager_finish(spc); TAILQ_REMOVE(&swap_pager_done, spc, spc_list); goto doclean; } /* * No operations done, thats all we can do for now. */ splx(s); break; /* * The desired page was found to be busy earlier in the scan * but has since completed. 
*/ doclean: if (tspc && tspc == spc) { tspc = NULL; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); splx(s); } return; } void swap_pager_finish(spc) register swp_clean_t spc; { vm_object_t object = spc->spc_m[0]->object; int i; object->paging_in_progress -= spc->spc_count; if ((object->paging_in_progress == 0) && (object->flags & OBJ_PIPWNT)) { object->flags &= ~OBJ_PIPWNT; wakeup(object); } /* * If no error, mark as clean and inform the pmap system. If error, * mark as dirty so we will try again. (XXX could get stuck doing * this, should give up after awhile) */ if (spc->spc_flags & SPC_ERROR) { for (i = 0; i < spc->spc_count; i++) { printf("swap_pager_finish: I/O error, clean of page %lx failed\n", (u_long) VM_PAGE_TO_PHYS(spc->spc_m[i])); } } else { for (i = 0; i < spc->spc_count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); spc->spc_m[i]->dirty = 0; - if ((spc->spc_m[i]->flags & PG_ACTIVE) == 0 && + if ((spc->spc_m[i]->queue != PQ_ACTIVE) && ((spc->spc_m[i]->flags & PG_WANTED) || pmap_is_referenced(VM_PAGE_TO_PHYS(spc->spc_m[i])))) vm_page_activate(spc->spc_m[i]); } } for (i = 0; i < spc->spc_count; i++) { /* * we wakeup any processes that are waiting on these pages. */ PAGE_WAKEUP(spc->spc_m[i]); } nswiodone -= spc->spc_count; return; } /* * swap_pager_iodone */ static void swap_pager_iodone(bp) register struct buf *bp; { register swp_clean_t spc; int s; s = splbio(); spc = (swp_clean_t) bp->b_spc; TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); if (bp->b_flags & B_ERROR) { spc->spc_flags |= SPC_ERROR; printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n", (bp->b_flags & B_READ) ? "pagein" : "pageout", (u_long) bp->b_blkno, bp->b_bcount, bp->b_error); } if (bp->b_vp) pbrelvp(bp); if (bp->b_flags & B_WANTED) wakeup(bp); if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); nswiodone += spc->spc_count; if (--spc->spc_object->un_pager.swp.swp_poip == 0) { wakeup(spc->spc_object); } if ((swap_pager_needflags & SWAP_FREE_NEEDED) || swap_pager_inuse.tqh_first == 0) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) { swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT; pagedaemon_wakeup(); } if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } if ((swap_pager_inuse.tqh_first == NULL) || ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min && nswiodone + cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min)) { pagedaemon_wakeup(); } splx(s); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 13489) +++ head/sys/vm/vm_fault.c (revision 13490) @@ -1,1000 +1,998 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.38 1995/12/07 12:48:10 davidg Exp $ + * $Id: vm_fault.c,v 1.39 1995/12/11 04:58:06 dyson Exp $ */ /* * Page fault handling module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int vm_fault_additional_pages __P((vm_page_t, int, int, vm_page_t *, int *)); #define VM_FAULT_READ_AHEAD 4 #define VM_FAULT_READ_BEHIND 3 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) /* * vm_fault: * * Handle a page fault occuring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. 
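 *
 * For orientation, a machine-dependent trap handler invokes this
 * roughly as follows (a sketch only; the locals are hypothetical
 * and the real trap code differs in detail):
 *
 *	vm_offset_t va = trunc_page(fault_addr);
 *	int rv = vm_fault(&p->p_vmspace->vm_map, va,
 *	    write_fault ? VM_PROT_WRITE : VM_PROT_READ, FALSE);
 *	if (rv != KERN_SUCCESS)
 *		(the fault is fatal; post a signal to the process)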
* * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(map, vaddr, fault_type, change_wiring) vm_map_t map; vm_offset_t vaddr; vm_prot_t fault_type; boolean_t change_wiring; { vm_object_t first_object; vm_pindex_t first_pindex; vm_map_entry_t entry; register vm_object_t object; register vm_pindex_t pindex; vm_page_t m; vm_page_t first_m; vm_prot_t prot; int result; boolean_t wired; boolean_t su; boolean_t lookup_still_valid; vm_page_t old_m; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int hardfault = 0; struct vnode *vp = NULL; cnt.v_vm_faults++; /* needs lock XXX */ /* * Recovery actions */ #define FREE_PAGE(m) { \ PAGE_WAKEUP(m); \ vm_page_free(m); \ } #define RELEASE_PAGE(m) { \ PAGE_WAKEUP(m); \ - if ((m->flags & PG_ACTIVE) == 0) vm_page_activate(m); \ + if (m->queue != PQ_ACTIVE) vm_page_activate(m); \ } #define UNLOCK_MAP { \ if (lookup_still_valid) { \ vm_map_lookup_done(map, entry); \ lookup_still_valid = FALSE; \ } \ } #define UNLOCK_THINGS { \ vm_object_pip_wakeup(object); \ if (object != first_object) { \ FREE_PAGE(first_m); \ vm_object_pip_wakeup(first_object); \ } \ UNLOCK_MAP; \ if (vp != NULL) VOP_UNLOCK(vp); \ } #define UNLOCK_AND_DEALLOCATE { \ UNLOCK_THINGS; \ vm_object_deallocate(first_object); \ } RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, &first_object, &first_pindex, &prot, &wired, &su)) != KERN_SUCCESS) { return (result); } vp = vnode_pager_lock(first_object); lookup_still_valid = TRUE; if (wired) fault_type = prot; first_m = NULL; /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. */ first_object->ref_count++; first_object->paging_in_progress++; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object lock or a busy * page in some object to prevent some other process from trying to * bring in the same page. * * Note that we cannot hold any locks during the pager access or when * waiting for memory, so we use a busy page then. * * Note also that we aren't as concerned about more than one thead * attempting to pager_data_unlock the same page at once, so we don't * hold the page as busy then, but do record the highest unlock value * so far. [Unlock requests may also be delivered out of order.] * * 2) Once we have a busy page, we must remove it from the pageout * queues, so that the pageout daemon will not grab it away. * * 3) To prevent another process from racing us down the shadow chain * and entering a new page in the top object before we do, we must * keep a busy page in the top object while following the shadow * chain. * * 4) We must increment paging_in_progress on any object for which * we have a busy page, to prevent vm_object_collapse from removing * the busy page without our noticing. */ /* * Search for the page at object/offset. */ object = first_object; pindex = first_pindex; /* * See whether this page is resident */ while (TRUE) { m = vm_page_lookup(object, pindex); if (m != NULL) { /* * If the page is being brought in, wait for it and * then retry. 
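 * The spl dance below guards against a lost wakeup: the busy bits
 * are re-tested at splhigh() before PG_WANTED is set and the process
 * sleeps, since otherwise the page could be un-busied (and
 * PAGE_WAKEUP() run) between the first test and the tsleep(),
 * leaving the faulting process asleep with nobody left to wake it.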
*/ if ((m->flags & PG_BUSY) || m->busy) { int s; UNLOCK_THINGS; s = splhigh(); if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED | PG_REFERENCED; cnt.v_intrans++; tsleep(m, PSWP, "vmpfw", 0); } splx(s); vm_object_deallocate(first_object); goto RetryFault; } /* * Mark page busy for other processes, and the pagedaemon. */ m->flags |= PG_BUSY; - if ((m->flags & PG_CACHE) && + if ((m->queue == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_reserved) { UNLOCK_AND_DEALLOCATE; VM_WAIT; PAGE_WAKEUP(m); goto RetryFault; } - if (m->valid && ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && - m->object != kernel_object && m->object != kmem_object) { + if (m->valid && + ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && + m->object != kernel_object && m->object != kmem_object) { goto readrest; } break; } if (((object->type != OBJT_DEFAULT) && (!change_wiring || wired)) || (object == first_object)) { if (pindex >= object->size) { UNLOCK_AND_DEALLOCATE; return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. */ m = vm_page_alloc(object, pindex, - vp?VM_ALLOC_NORMAL:(VM_ALLOC_NORMAL|VM_ALLOC_ZERO)); + vp?VM_ALLOC_NORMAL:VM_ALLOC_ZERO); if (m == NULL) { UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } } readrest: if (object->type != OBJT_DEFAULT && (!change_wiring || wired)) { int rv; int faultcount; int reqpage; /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. */ faultcount = vm_fault_additional_pages( m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. */ UNLOCK_MAP; rv = faultcount ? vm_pager_get_pages(object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ m = vm_page_lookup(object, pindex); if( !m) { UNLOCK_AND_DEALLOCATE; goto RetryFault; } hardfault++; break; } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager input (probably hardware) error, PID %d failure\n", curproc->p_pid); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { FREE_PAGE(m); UNLOCK_AND_DEALLOCATE; return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (object != first_object) { FREE_PAGE(m); /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! 
*/ } } /* * We get here if the object has default pager (or unwiring) or the * pager doesn't have the page. */ if (object == first_object) first_m = m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ pindex += OFF_TO_IDX(object->backing_object_offset); next_object = object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (object != first_object) { vm_object_pip_wakeup(object); object = first_object; pindex = first_pindex; m = first_m; } first_m = NULL; if ((m->flags & PG_ZERO) == 0) vm_page_zero_fill(m); m->valid = VM_PAGE_BITS_ALL; cnt.v_zfod++; break; } else { if (object != first_object) { vm_object_pip_wakeup(object); } object = next_object; object->paging_in_progress++; } } if ((m->flags & PG_BUSY) == 0) panic("vm_fault: not busy after main loop"); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ old_m = m; /* save page that would be copied */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (object != first_object) { /* * We only really need to copy if we want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * If we try to collapse first_object at this point, * we may deadlock when we try to get the lock on an * intermediate object (since we have the bottom * object locked). We can't unlock the bottom object, * because the page we found may move (by collapse) if * we do. * * Instead, we first copy the page. Then, when we have * no more use for the bottom object, we unlock it and * try to collapse. * * Note that we copy the page even if we didn't need * to... that's the breaks. */ /* * We already have an empty page in first_object - use * it. */ vm_page_copy(m, first_m); first_m->valid = VM_PAGE_BITS_ALL; /* * If another map is truly sharing this page with us, * we have to flush all uses of the original page, * since we can't distinguish those which want the * original from those which need the new copy. * * XXX If we know that only one map has access to this * page, then we could avoid the pmap_page_protect() * call. */ - if ((m->flags & PG_ACTIVE) == 0) + if (m->queue != PQ_ACTIVE) vm_page_activate(m); - vm_page_protect(m, VM_PROT_NONE); /* * We no longer need the old page or object. */ PAGE_WAKEUP(m); vm_object_pip_wakeup(object); /* * Only use the new page below... */ cnt.v_cow_faults++; m = first_m; object = first_object; pindex = first_pindex; /* * Now that we've gotten the copy out of the way, * let's try to collapse the top object. * * But we have to play ugly games with * paging_in_progress to do that... */ vm_object_pip_wakeup(object); vm_object_collapse(object); object->paging_in_progress++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; /* * Since map entries may be pageable, make sure we can take a * page fault on them. */ /* * To avoid trying to write_lock the map while another process * has it read_locked (in vm_map_pageable), we do not try for * write permission. If the page is still writable, we will * get write permission. If it is not, or has been marked * needs_copy, we enter the mapping without write permission, * and will merely take another fault. 
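 * Concretely, the protections from the re-lookup are intersected
 * with the original grant (prot &= retry_prot below), so a mapping
 * that lost write permission while the map was unlocked is simply
 * entered read-only and pays one extra fault later, instead of
 * requiring a write-lock on the map here.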
*/ result = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE, &entry, &retry_object, &retry_pindex, &retry_prot, &wired, &su); /* * If we don't need the page any longer, put it on the active * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; return (result); } lookup_still_valid = TRUE; if ((retry_object != first_object) || (retry_pindex != first_pindex)) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } /* * (the various bits we're fiddling with here are locked by the * object's lock) */ /* * It's critically important that a wired-down page be faulted only * once in each map for which it is wired. */ /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter may cause other faults. We don't put the page * back on the active queue until later so that the page-out daemon * won't find us (yet). */ if (prot & VM_PROT_WRITE) { m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; /* * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. */ if (fault_type & VM_PROT_WRITE) { m->dirty = VM_PAGE_BITS_ALL; } } m->flags |= PG_MAPPED|PG_REFERENCED; m->flags &= ~PG_ZERO; pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); #if 0 if (change_wiring == 0 && wired == 0) pmap_prefault(map->pmap, vaddr, entry, first_object); #endif /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (change_wiring) { if (wired) vm_page_wire(m); else vm_page_unwire(m); } else { - if ((m->flags & PG_ACTIVE) == 0) + if (m->queue != PQ_ACTIVE) vm_page_activate(m); } if (curproc && (curproc->p_flag & P_INMEM) && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } - if ((m->flags & PG_BUSY) == 0) - printf("page not busy: %d\n", m->pindex); /* * Unlock everything, and return */ PAGE_WAKEUP(m); UNLOCK_AND_DEALLOCATE; return (KERN_SUCCESS); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the range of addresses may * not fault, so that page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it in the physical * map. */ for (va = start; va < end; va += PAGE_SIZE) { while( curproc != pageproc && (cnt.v_free_count <= cnt.v_pageout_free_min)) VM_WAIT; rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, TRUE); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. 
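 *
 * The expected pairing with vm_fault_wire() is (a sketch; start and
 * end are hypothetical page-aligned bounds):
 *
 *	if ((rv = vm_fault_wire(map, start, end)) != KERN_SUCCESS)
 *		return (rv);
 *	... use the wired range ...
 *	vm_fault_unwire(map, start, end);
 *
 * vm_fault_wire() unwires [start, va) itself on failure, so callers
 * never see a half-wired range.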
*/ void vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va, pa; register pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa == (vm_offset_t) 0) { panic("unwire: page not in pmap"); } pmap_change_wiring(pmap, va, FALSE); vm_page_unwire(PHYS_TO_VM_PAGE(pa)); } /* * Inform the physical mapping system that the range of addresses may * fault, so that page tables and such may be unwired themselves. */ pmap_pageable(pmap, start, end, TRUE); } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t dst_object; vm_object_t src_object; vm_ooffset_t dst_offset; vm_ooffset_t src_offset; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif /* lint */ src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, (vm_size_t) OFF_TO_IDX(dst_entry->end - dst_entry->start)); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying each * one from the source object (it should be there) to the destination * object. */ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ do { dst_m = vm_page_alloc(dst_object, OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_WAIT; } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ src_m = vm_page_lookup(src_object, OFF_TO_IDX(dst_offset + src_offset)); if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); /* * Enter it in the pmap... */ dst_m->flags |= PG_WRITEABLE|PG_MAPPED; dst_m->flags &= ~PG_ZERO; pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), prot, FALSE); /* * Mark it no longer busy, and put it on the active list. */ vm_page_activate(dst_m); PAGE_WAKEUP(dst_m); } } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. 
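 * A worked example (hypothetical numbers): for pindex 10 with
 * rbehind 3 and rahead 4, the candidate window is pindexes 7..14,
 * i.e. rbehind + rahead + 1 = 8 pages.  The backward scan shrinks
 * the window at the first already-resident page, the forward scan
 * stops at the first resident page, and the cluster is abandoned in
 * favor of the single requested page when free memory is tight or
 * the pager cannot supply the neighbors (cbehind/cahead).  With
 * nothing clamped, *reqpage comes back as 3, the requested page's
 * index within marray.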
* * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_offset_t size; vm_page_t rtm; int treqpage; int cbehind, cahead; object = m->object; pindex = m->pindex; /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) return 0; if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } /* * try to do any readahead that we might have free pages for. */ if ((rahead + rbehind) > ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) { pagedaemon_wakeup(); *reqpage = 0; marray[0] = m; return 1; } /* * scan backward for the read behind pages -- in memory or on disk not * in same object */ tpindex = pindex - 1; if (tpindex < pindex) { if (rbehind > pindex) rbehind = pindex; startpindex = pindex - rbehind; while (tpindex >= startpindex) { if (vm_page_lookup( object, tpindex)) { startpindex = tpindex + 1; break; } if (tpindex == 0) break; tpindex -= 1; } } else { startpindex = pindex; } /* * scan forward for the read ahead pages -- in memory or on disk not * in same object */ tpindex = pindex + 1; endpindex = pindex + (rahead + 1); if (endpindex > object->size) endpindex = object->size; while (tpindex < endpindex) { if ( vm_page_lookup(object, tpindex)) { break; } tpindex += 1; } endpindex = tpindex; /* calculate number of bytes of pages */ size = endpindex - startpindex; /* calculate the page offset of the required page */ treqpage = pindex - startpindex; /* see if we have space (again) */ if ((cnt.v_free_count + cnt.v_cache_count) > (cnt.v_free_reserved + size)) { /* * get our pages and don't block for them */ for (i = 0; i < size; i++) { if (i != treqpage) { rtm = vm_page_alloc(object, startpindex + i, VM_ALLOC_NORMAL); if (rtm == NULL) { if (i < treqpage) { int j; for (j = 0; j < i; j++) { FREE_PAGE(marray[j]); } *reqpage = 0; marray[0] = m; return 1; } else { size = i; *reqpage = treqpage; return size; } } marray[i] = rtm; } else { marray[i] = m; } } *reqpage = treqpage; return size; } *reqpage = 0; marray[0] = m; return 1; } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 13489) +++ head/sys/vm/vm_glue.c (revision 13490) @@ -1,582 +1,633 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_glue.c,v 1.33 1995/12/14 09:54:57 phk Exp $ + * $Id: vm_glue.c,v 1.35 1996/01/04 21:13:14 wollman Exp $ */ #include "opt_sysvipc.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits __P((void *)); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler __P((void *)); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL) static void swapout __P((struct proc *)); extern char kstack[]; /* vm_map_t upages_map; */ int kernacc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; saddr = trunc_page(addr); eaddr = round_page(addr + len); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); return (rv == TRUE); } int useracc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_prot_t prot = rw == B_READ ? 
VM_PROT_READ : VM_PROT_WRITE; /* * XXX - check separately to disallow access to user area and user * page tables - they are in the map. * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once * only used (as an end address) in trap.c. Use it as an end address * here too. This bogusness has spread. I just fixed where it was * used as a max in vm_mmap.c. */ if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS || (vm_offset_t) addr + len < (vm_offset_t) addr) { return (FALSE); } rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), prot); return (rv == TRUE); } #ifdef KGDB /* * Change protections on kernel pages from addr to addr+len * (presumably so debugger can plant a breakpoint). * All addresses are assumed to reside in the Sysmap, */ chgkprot(addr, len, rw) register caddr_t addr; int len, rw; { vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; vm_map_protect(kernel_map, trunc_page(addr), round_page(addr + len), prot, FALSE); } #endif void vslock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), FALSE); } void vsunlock(addr, len, dirtied) caddr_t addr; u_int len; int dirtied; { #ifdef lint dirtied++; #endif /* lint */ vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), TRUE); } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. * NOTE: the kernel stack may be at a different location in the child * process, and thus addresses of automatic variables may be invalid * after cpu_fork returns in the child process. We do nothing here * after cpu_fork returns. */ int vm_fork(p1, p2, isvfork) register struct proc *p1, *p2; int isvfork; { register struct user *up; - vm_offset_t addr, ptaddr; + vm_offset_t addr, ptaddr, ptpa; int error, i; - struct vm_map *vp; + vm_map_t vp; + pmap_t pvp; + vm_page_t stkm; while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { VM_WAIT; } /* * avoid copying any of the parent's pagetables or other per-process * objects that reside in the map by marking all of them * non-inheritable */ (void) vm_map_inherit(&p1->p_vmspace->vm_map, UPT_MIN_ADDRESS - UPAGES * PAGE_SIZE, VM_MAX_ADDRESS, VM_INHERIT_NONE); p2->p_vmspace = vmspace_fork(p1->p_vmspace); #ifdef SYSVSHM if (p1->p_vmspace->vm_shm) shmfork(p1, p2, isvfork); #endif /* * Allocate a wired-down (for now) pcb and kernel stack for the * process */ addr = (vm_offset_t) kstack; vp = &p2->p_vmspace->vm_map; + pvp = &p2->p_vmspace->vm_pmap; /* get new pagetables and kernel stack */ - (void) vm_map_find(vp, NULL, 0, &addr, UPT_MAX_ADDRESS - addr, FALSE); + (void) vm_map_find(vp, NULL, 0, &addr, UPT_MAX_ADDRESS - addr, FALSE, + VM_PROT_ALL, VM_PROT_ALL, 0); - /* force in the page table encompassing the UPAGES */ - ptaddr = trunc_page((u_int) vtopte(addr)); - error = vm_map_pageable(vp, ptaddr, ptaddr + PAGE_SIZE, FALSE); - if (error) - panic("vm_fork: wire of PT failed. error=%d", error); - - /* and force in (demand-zero) the UPAGES */ - error = vm_map_pageable(vp, addr, addr + UPAGES * PAGE_SIZE, FALSE); - if (error) - panic("vm_fork: wire of UPAGES failed. 
error=%d", error); - /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("vm_fork: u_map allocation failed"); - /* and force-map the upages into the kernel pmap */ - for (i = 0; i < UPAGES; i++) - pmap_kenter(((vm_offset_t) up) + PAGE_SIZE * i, - pmap_extract(vp->pmap, addr + PAGE_SIZE * i)); + p2->p_vmspace->vm_upages_obj = vm_object_allocate( OBJT_DEFAULT, + UPAGES); + ptaddr = trunc_page((u_int) vtopte(kstack)); + (void) vm_fault(vp, ptaddr, VM_PROT_READ|VM_PROT_WRITE, FALSE); + ptpa = pmap_extract(pvp, ptaddr); + if (ptpa == 0) { + panic("vm_fork: no pte for UPAGES"); + } + stkm = PHYS_TO_VM_PAGE(ptpa); + vm_page_hold(stkm); + + for(i=0;ip_vmspace->vm_upages_obj, i, VM_ALLOC_ZERO)) == NULL) { + VM_WAIT; + } + + vm_page_wire(m); + m->flags &= ~PG_BUSY; + pmap_enter( pvp, (vm_offset_t) kstack + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m), VM_PROT_READ|VM_PROT_WRITE, 1); + pmap_kenter(((vm_offset_t) up) + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m)); + if ((m->flags & PG_ZERO) == 0) + bzero(((caddr_t) up) + i * PAGE_SIZE, PAGE_SIZE); + m->flags &= ~PG_ZERO; + m->valid = VM_PAGE_BITS_ALL; + } + vm_page_unhold(stkm); + p2->p_addr = up; /* * p_stats and p_sigacts currently point at fields in the user struct * but not at &u, instead at p_addr. Copy p_sigacts and parts of * p_stats; zero the rest of p_stats (statistics). */ p2->p_stats = &up->u_stats; p2->p_sigacts = &up->u_sigacts; up->u_sigacts = *p1->p_sigacts; bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the kernel stack and pcb, and make * the child ready to run. It marks the child so that it can return * differently than the parent. It returns twice, once in the parent * process and once in the child. */ return (cpu_fork(p1, p2)); } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { register struct proc *p = udata; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { vm_offset_t i; vm_offset_t ptaddr; int s; if ((p->p_flag & P_INMEM) == 0) { - vm_map_t map; + vm_map_t map = &p->p_vmspace->vm_map; + pmap_t pmap = &p->p_vmspace->vm_pmap; + vm_page_t stkm, m; + vm_offset_t ptpa; int error; ++p->p_lock; - map = &p->p_vmspace->vm_map; - /* force the page table encompassing the kernel stack (upages) */ ptaddr = trunc_page((u_int) vtopte(kstack)); - error = vm_map_pageable(map, ptaddr, ptaddr + PAGE_SIZE, FALSE); - if (error) - panic("faultin: wire of PT failed. 
error=%d", error); + (void) vm_fault(map, ptaddr, VM_PROT_READ|VM_PROT_WRITE, FALSE); + ptpa = pmap_extract(&p->p_vmspace->vm_pmap, ptaddr); + if (ptpa == 0) { + panic("vm_fork: no pte for UPAGES"); + } + stkm = PHYS_TO_VM_PAGE(ptpa); + vm_page_hold(stkm); - /* wire in the UPAGES */ - error = vm_map_pageable(map, (vm_offset_t) kstack, - (vm_offset_t) kstack + UPAGES * PAGE_SIZE, FALSE); - if (error) - panic("faultin: wire of UPAGES failed. error=%d", error); + for(i=0;ip_vmspace->vm_pmap, - (vm_offset_t) kstack + off); +retry: + if ((m = vm_page_lookup(p->p_vmspace->vm_upages_obj, i)) == NULL) { + if ((m = vm_page_alloc(p->p_vmspace->vm_upages_obj, i, VM_ALLOC_NORMAL)) == NULL) { + VM_WAIT; + goto retry; + } + } else { + if ((m->flags & PG_BUSY) || m->busy) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "swinuw",0); + goto retry; + } + } + vm_page_wire(m); + if (m->valid == VM_PAGE_BITS_ALL) + m->flags &= ~PG_BUSY; + splx(s); - if (pa == 0) - panic("faultin: missing page for UPAGES\n"); - - pmap_kenter(((vm_offset_t) p->p_addr) + off, pa); + pmap_enter( pmap, (vm_offset_t) kstack + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m), VM_PROT_READ|VM_PROT_WRITE, TRUE); + pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m)); + if (m->valid != VM_PAGE_BITS_ALL) { + int rv; + rv = vm_pager_get_pages(p->p_vmspace->vm_upages_obj, + &m, 1, 0); + if (rv != VM_PAGER_OK) + panic("faultin: cannot get upages for proc: %d\n", p->p_pid); + m->valid = VM_PAGE_BITS_ALL; + m->flags &= ~PG_BUSY; + } } + vm_page_unhold(stkm); + s = splhigh(); if (p->p_stat == SRUN) setrunqueue(p); p->p_flag |= P_INMEM; /* undo the effect of setting SLOCK above */ --p->p_lock; splx(s); } } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { register struct proc *p; register int pri; struct proc *pp; int ppri; loop: while ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_reserved + UPAGES + 2)) { VM_WAIT; } pp = NULL; ppri = INT_MIN; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { - if (p->p_stat == SRUN && (p->p_flag & (P_INMEM | P_SWAPPING)) == 0) { + if (p->p_stat == SRUN && + (p->p_flag & (P_INMEM | P_SWAPPING)) == 0) { int mempri; pri = p->p_swtime + p->p_slptime - p->p_nice * 8; mempri = pri > 0 ? pri : 0; /* * if this process is higher priority and there is * enough space, then select this process instead of * the previous selection. */ if (pri > ppri) { pp = p; ppri = pri; } } } /* * Nothing to do, back to sleep */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "sched", 0); goto loop; } /* * We would like to bring someone in. (only if there is space). */ faultin(p); p->p_swtime = 0; goto loop; } #define swappable(p) \ (((p)->p_lock == 0) && \ ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO|P_SWAPPING)) == P_INMEM) extern int vm_pageout_free_min; /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. 
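 *
 * A process qualifies only when swappable(p) holds, i.e. it is not
 * locked and, of the flag bits tested, exactly P_INMEM is set:
 *
 *	((p)->p_lock == 0) &&
 *	((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|
 *	    P_WEXIT|P_PHYSIO|P_SWAPPING)) == P_INMEM
 *
 * The loop below additionally skips realtime processes, processes
 * sleeping on critical events (priority below PSOCK), and processes
 * that have been asleep for four seconds or less.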
*/ void swapout_procs() { register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; int didswap = 0; outp = outp2 = NULL; outpri = outpri2 = INT_MIN; retry: for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (!swappable(p)) continue; switch (p->p_stat) { default: continue; case SSLEEP: case SSTOP: /* * do not swapout a realtime process */ if (p->p_rtprio.type == RTP_PRIO_REALTIME) continue; /* * do not swapout a process waiting on a critical * event of some kind */ if (((p->p_priority & 0x7f) < PSOCK) || (p->p_slptime <= 4)) continue; vm_map_reference(&p->p_vmspace->vm_map); /* * do not swapout a process that is waiting for VM * datastructures there is a possible deadlock. */ if (!lock_try_write(&p->p_vmspace->vm_map.lock)) { vm_map_deallocate(&p->p_vmspace->vm_map); continue; } vm_map_unlock(&p->p_vmspace->vm_map); /* * If the process has been asleep for awhile and had * most of its pages taken away already, swap it out. */ swapout(p); vm_map_deallocate(&p->p_vmspace->vm_map); didswap++; goto retry; } } /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapout(p) register struct proc *p; { vm_map_t map = &p->p_vmspace->vm_map; + pmap_t pmap = &p->p_vmspace->vm_pmap; vm_offset_t ptaddr; int i; ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = p->p_vmspace->vm_pmap.pm_stats.resident_count; (void) splhigh(); p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPING; if (p->p_stat == SRUN) remrq(p); (void) spl0(); /* * let the upages be paged */ - for(i=0;ip_vmspace->vm_upages_obj, i)) == NULL) + panic("swapout: upage already missing???"); + m->dirty = VM_PAGE_BITS_ALL; + vm_page_unwire(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); - - vm_map_pageable(map, (vm_offset_t) kstack, - (vm_offset_t) kstack + UPAGES * PAGE_SIZE, TRUE); - - ptaddr = trunc_page((u_int) vtopte(kstack)); - vm_map_pageable(map, ptaddr, ptaddr + PAGE_SIZE, TRUE); + } + pmap_remove(pmap, (vm_offset_t) kstack, + (vm_offset_t) kstack + PAGE_SIZE * UPAGES); p->p_flag &= ~P_SWAPPING; p->p_swtime = 0; } #ifdef DDB /* * DEBUG stuff */ int indent; #include /* see subr_prf.c */ /*ARGSUSED2*/ void #if __STDC__ iprintf(const char *fmt,...) #else iprintf(fmt /* , va_alist */ ) char *fmt; /* va_dcl */ #endif { register int i; va_list ap; for (i = indent; i >= 8; i -= 8) printf("\t"); while (--i >= 0) printf(" "); va_start(ap, fmt); printf("%r", fmt, ap); va_end(ap); } #endif /* DDB */ Index: head/sys/vm/vm_kern.c =================================================================== --- head/sys/vm/vm_kern.c (revision 13489) +++ head/sys/vm/vm_kern.c (revision 13490) @@ -1,462 +1,464 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_kern.c,v 1.19 1995/12/10 14:52:09 bde Exp $ + * $Id: vm_kern.c,v 1.20 1995/12/11 04:58:09 dyson Exp $ */ /* * Kernel memory management. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include vm_map_t buffer_map; vm_map_t kernel_map; vm_map_t kmem_map; vm_map_t mb_map; int mb_map_full; vm_map_t io_map; vm_map_t clean_map; vm_map_t phys_map; vm_map_t exec_map; vm_map_t u_map; /* * kmem_alloc_pageable: * * Allocate pageable memory to the kernel's address map. * "map" must be kernel_map or a submap of kernel_map. */ vm_offset_t kmem_alloc_pageable(map, size) vm_map_t map; register vm_size_t size; { vm_offset_t addr; register int result; size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, (vm_offset_t) 0, - &addr, size, TRUE); + &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS) { return (0); } return (addr); } /* * Allocate wired-down memory in the kernel's address map * or a submap. 
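 *
 * Typical use is (a sketch; nbytes is hypothetical):
 *
 *	vm_offset_t va = kmem_alloc(kernel_map, round_page(nbytes));
 *	if (va == 0)
 *		(the map had no room; fail or wait)
 *	...
 *	kmem_free(kernel_map, va, round_page(nbytes));
 *
 * The memory comes back wired and zero-filled.  Because this path
 * can block in VM_WAIT, interrupt-time callers must go through
 * kmem_malloc() on kmem_map instead.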
*/ vm_offset_t kmem_alloc(map, size) register vm_map_t map; register vm_size_t size; { vm_offset_t addr; register vm_offset_t offset; vm_offset_t i; size = round_page(size); /* * Use the kernel object for wired-down kernel pages. Assume that no * region of the kernel object is referenced more than once. */ /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); return (0); } offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kernel_object); - vm_map_insert(map, kernel_object, offset, addr, addr + size); + vm_map_insert(map, kernel_object, offset, addr, addr + size, + VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Guarantee that there are pages already in this object before * calling vm_map_pageable. This is to prevent the following * scenario: * * 1) Threads have swapped out, so that there is a pager for the * kernel_object. 2) The kmsg zone is empty, and so we are * kmem_allocing a new page for it. 3) vm_map_pageable calls vm_fault; * there is no page, but there is a pager, so we call * pager_data_request. But the kmsg zone is empty, so we must * kmem_alloc. 4) goto 1 5) Even if the kmsg zone is not empty: when * we get the data back from the pager, it will be (very stale) * non-zero data. kmem_alloc is defined to return zero-filled memory. * * We're intentionally not activating the pages we allocate to prevent a * race with page-out. vm_map_pageable will wire the pages. */ for (i = 0; i < size; i += PAGE_SIZE) { vm_page_t mem; while ((mem = vm_page_alloc(kernel_object, - OFF_TO_IDX(offset + i), - (VM_ALLOC_NORMAL|VM_ALLOC_ZERO))) == NULL) { + OFF_TO_IDX(offset + i), VM_ALLOC_ZERO)) == NULL) { VM_WAIT; } if ((mem->flags & PG_ZERO) == 0) vm_page_zero_fill(mem); mem->flags &= ~(PG_BUSY|PG_ZERO); mem->valid = VM_PAGE_BITS_ALL; } /* * And finally, mark the data as non-pageable. */ (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); /* * Try to coalesce the map */ vm_map_simplify(map, addr); return (addr); } /* * kmem_free: * * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. */ void kmem_free(map, addr, size) vm_map_t map; register vm_offset_t addr; vm_size_t size; { (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. 
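 * For instance (a sketch; the name and size are hypothetical), a
 * boot-time submap is carved out along these lines:
 *
 *	vm_offset_t minaddr, maxaddr;
 *	vm_map_t my_map = kmem_suballoc(kernel_map, &minaddr,
 *	    &maxaddr, 16 * PAGE_SIZE, TRUE);
 *
 * after which allocations against my_map consume only that window
 * of kernel address space.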
* * Arguments are as follows: * * parent Map to take range from * size Size of range to find * min, max Returned endpoints of map * pageable Can the region be paged */ vm_map_t kmem_suballoc(parent, min, max, size, pageable) register vm_map_t parent; vm_offset_t *min, *max; register vm_size_t size; boolean_t pageable; { register int ret; vm_map_t result; size = round_page(size); *min = (vm_offset_t) vm_map_min(parent); ret = vm_map_find(parent, NULL, (vm_offset_t) 0, - min, size, TRUE); + min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); } *max = *min + size; pmap_reference(vm_map_pmap(parent)); result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return (result); } /* * Allocate wired-down memory in the kernel's address map for the higher * level kernel memory allocator (kern/kern_malloc.c). We cannot use * kmem_alloc() because we may need to allocate memory at interrupt * level where we cannot block (canwait == FALSE). * * This routine has its own private kernel submap (kmem_map) and object * (kmem_object). This, combined with the fact that only malloc uses * this routine, ensures that we will never block in map or object waits. * * Note that this still only works in a uni-processor environment and * when called at splhigh(). * * We don't worry about expanding the map (adding entries) since entries * for wired maps are statically allocated. */ vm_offset_t kmem_malloc(map, size, waitflag) register vm_map_t map; register vm_size_t size; boolean_t waitflag; { register vm_offset_t offset, i; vm_map_entry_t entry; vm_offset_t addr; vm_page_t m; if (map != kmem_map && map != mb_map) panic("kmem_malloc: map != {kmem,mb}_map"); size = round_page(size); addr = vm_map_min(map); /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); if (map == mb_map) { mb_map_full = TRUE; log(LOG_ERR, "mb_map full\n"); return (0); } if (waitflag == M_WAITOK) panic("kmem_malloc: kmem_map too small"); return (0); } offset = addr - vm_map_min(kmem_map); vm_object_reference(kmem_object); - vm_map_insert(map, kmem_object, offset, addr, addr + size); + vm_map_insert(map, kmem_object, offset, addr, addr + size, + VM_PROT_ALL, VM_PROT_ALL, 0); /* * If we can wait, just mark the range as wired (will fault pages as * necessary). */ if (waitflag == M_WAITOK) { vm_map_unlock(map); (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); vm_map_simplify(map, addr); return (addr); } /* * If we cannot wait then we must allocate all memory up front, * pulling it off the active queue to prevent pageout. */ for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. 
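 * (The pages came from vm_page_alloc() still marked PG_BUSY and were
 * never placed on the active, inactive or cache queues.  Note also
 * how waitflag selects the allocation class above: M_NOWAIT callers
 * may dip into the interrupt reserve via VM_ALLOC_INTERRUPT, while
 * all others use VM_ALLOC_SYSTEM.)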
*/ if (m == NULL) { while (i != 0) { i -= PAGE_SIZE; m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); vm_page_free(m); } vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return (0); } m->flags &= ~(PG_BUSY|PG_ZERO); m->valid = VM_PAGE_BITS_ALL; } /* * Mark map entry as non-pageable. Assert: vm_map_insert() will never * be able to extend the previous entry so there will be a new entry * exactly corresponding to this address range and it will have * wired_count == 0. */ if (!vm_map_lookup_entry(map, addr, &entry) || entry->start != addr || entry->end != addr + size || entry->wired_count) panic("kmem_malloc: entry not found or misaligned"); entry->wired_count++; /* * Loop thru pages, entering them in the pmap. (We cannot add them to * the wired count without wrapping the vm_page_queue_lock in * splimp...) */ for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); + vm_page_wire(m); pmap_kenter(addr + i, VM_PAGE_TO_PHYS(m)); } vm_map_unlock(map); vm_map_simplify(map, addr); return (addr); } /* * kmem_alloc_wait * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * */ vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; size = round_page(size); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); return (0); } vm_map_unlock(map); tsleep(map, PVM, "kmaw", 0); } - vm_map_insert(map, NULL, (vm_offset_t) 0, addr, addr + size); + vm_map_insert(map, NULL, (vm_offset_t) 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); return (addr); } /* * kmem_free_wakeup * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmem_free_wakeup(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); wakeup(map); vm_map_unlock(map); } /* * Create the kernel map; insert a mapping covering kernel text, data, bss, * and all space allocated thus far (`boostrap' data). The new map will thus * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and * the range between `start' and `end' as free. */ void kmem_init(start, end) vm_offset_t start, end; { register vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE); vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_offset_t) 0, - VM_MIN_KERNEL_ADDRESS, start); + VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 13489) +++ head/sys/vm/vm_map.c (revision 13490) @@ -1,2327 +1,2337 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_map.c,v 1.30 1995/12/14 09:54:59 phk Exp $ + * $Id: vm_map.c,v 1.31 1996/01/04 21:13:17 wollman Exp $ */ /* * Virtual memory mapping module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. 
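* * (An illustrative sketch, not from the source: with hypothetical addresses, a map holding two entries is threaded as * * &map->header <-> [0x1000,0x2000) <-> [0x3000,0x7000) <-> &map->header * * with map->hint caching the entry returned by the last successful lookup.)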
* * In order to properly represent the sharing of virtual * memory regions among maps, the map structure is bi-level. * Top-level ("address") maps refer to regions of sharable * virtual memory. These regions are implemented as * ("sharing") maps, which then refer to the actual virtual * memory objects. When two address maps "share" memory, * their top-level maps both have references to the same * sharing map. When memory is virtual-copied from one * address map to another, the references in the sharing * maps are actually copied -- no copying occurs at the * virtual memory object level. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * No attempt is currently made to "glue back together" two * abutting entries. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one sharing map to * another, and then marking both regions as copy-on-write. * It is important to note that only one writeable reference * to a VM object region exists in any map -- this means that * shadow object creation can be delayed until a write operation * occurs. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ vm_offset_t kentry_data; vm_size_t kentry_data_size; static vm_map_entry_t kentry_free; static vm_map_t kmap_free; +extern char kstack[]; static int kentry_count; static vm_offset_t mapvm_start, mapvm, mapvmmax; static int mapvmpgcnt; static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static vm_map_entry_t vm_map_entry_create __P((vm_map_t)); static void vm_map_entry_delete __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); static void vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t)); #ifdef notyet static void vm_map_simplify_entry __P((vm_map_t, vm_map_entry_t)); #endif void vm_map_startup() { register int i; register vm_map_entry_t mep; vm_map_t mp; /* * Static map structures for allocation before initialization of * kernel map or kmem map. vm_map_create knows how to deal with them. */ kmap_free = mp = (vm_map_t) kentry_data; i = MAX_KMAP; while (--i > 0) { mp->header.next = (vm_map_entry_t) (mp + 1); mp++; } mp++->header.next = NULL; /* * Form a free list of statically allocated kernel map entries with * the rest. */ kentry_free = mep = (vm_map_entry_t) mp; kentry_count = i = (kentry_data_size - MAX_KMAP * sizeof *mp) / sizeof *mep; while (--i > 0) { mep->next = mep + 1; mep++; } mep->next = NULL; } /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * The remaining fields must be initialized by the caller.
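* * A minimal usage sketch (illustrative only; the bounds shown are the customary user-process values, not a requirement of this interface): * * struct vmspace *vm; * * vm = vmspace_alloc(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, TRUE); * ... * vmspace_free(vm);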
*/ struct vmspace * vmspace_alloc(min, max, pageable) vm_offset_t min, max; int pageable; { register struct vmspace *vm; if (mapvmpgcnt == 0 && mapvm == 0) { int s; mapvmpgcnt = (cnt.v_page_count * sizeof(struct vm_map_entry) + PAGE_SIZE - 1) / PAGE_SIZE; s = splhigh(); mapvm_start = mapvm = kmem_alloc_pageable(kmem_map, mapvmpgcnt * PAGE_SIZE); mapvmmax = mapvm_start + mapvmpgcnt * PAGE_SIZE; splx(s); if (!mapvm) mapvmpgcnt = 0; } MALLOC(vm, struct vmspace *, sizeof(struct vmspace), M_VMMAP, M_WAITOK); bzero(vm, (caddr_t) &vm->vm_startcopy - (caddr_t) vm); vm_map_init(&vm->vm_map, min, max, pageable); pmap_pinit(&vm->vm_pmap); vm->vm_map.pmap = &vm->vm_pmap; /* XXX */ vm->vm_refcnt = 1; return (vm); } void vmspace_free(vm) register struct vmspace *vm; { if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (--vm->vm_refcnt == 0) { + int s, i; + + pmap_remove(&vm->vm_pmap, (vm_offset_t) kstack, (vm_offset_t) kstack+UPAGES*PAGE_SIZE); + /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ vm_map_lock(&vm->vm_map); + vm_object_deallocate(vm->vm_upages_obj); (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); vm_map_unlock(&vm->vm_map); while( vm->vm_map.ref_count != 1) tsleep(&vm->vm_map.ref_count, PVM, "vmsfre", 0); --vm->vm_map.ref_count; pmap_release(&vm->vm_pmap); FREE(vm, M_VMMAP); } } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap, min, max, pageable) pmap_t pmap; vm_offset_t min, max; boolean_t pageable; { register vm_map_t result; if (kmem_map == NULL) { result = kmap_free; kmap_free = (vm_map_t) result->header.next; if (result == NULL) panic("vm_map_create: out of maps"); } else MALLOC(result, vm_map_t, sizeof(struct vm_map), M_VMMAP, M_WAITOK); vm_map_init(result, min, max, pageable); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. */ void vm_map_init(map, min, max, pageable) register struct vm_map *map; vm_offset_t min, max; boolean_t pageable; { map->header.next = map->header.prev = &map->header; map->nentries = 0; map->size = 0; map->ref_count = 1; map->is_main_map = TRUE; map->min_offset = min; map->max_offset = max; map->entries_pageable = pageable; map->first_free = &map->header; map->hint = &map->header; map->timestamp = 0; lock_init(&map->lock, TRUE); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static struct vm_map_entry *mappool; static int mappoolcnt; static vm_map_entry_t vm_map_entry_create(map) vm_map_t map; { vm_map_entry_t entry; int i; #define KENTRY_LOW_WATER 64 #define MAPENTRY_LOW_WATER 128 /* * This is a *very* nasty (and sort of incomplete) hack!!!! */ if (kentry_count < KENTRY_LOW_WATER) { if (mapvmpgcnt && mapvm) { vm_page_t m; m = vm_page_alloc(kmem_object, OFF_TO_IDX(mapvm - vm_map_min(kmem_map)), (map == kmem_map) ?
VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL); if (m) { int newentries; newentries = (PAGE_SIZE / sizeof(struct vm_map_entry)); vm_page_wire(m); m->flags &= ~PG_BUSY; m->valid = VM_PAGE_BITS_ALL; pmap_enter(vm_map_pmap(kmem_map), mapvm, VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, 1); m->flags |= PG_WRITEABLE|PG_MAPPED; entry = (vm_map_entry_t) mapvm; mapvm += PAGE_SIZE; --mapvmpgcnt; for (i = 0; i < newentries; i++) { vm_map_entry_dispose(kernel_map, entry); entry++; } } } } if (map == kernel_map || map == kmem_map || map == pager_map) { entry = kentry_free; if (entry) { kentry_free = entry->next; --kentry_count; return entry; } entry = mappool; if (entry) { mappool = entry->next; --mappoolcnt; return entry; } } else { entry = mappool; if (entry) { mappool = entry->next; --mappoolcnt; return entry; } MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), M_VMMAPENT, M_WAITOK); } if (entry == NULL) panic("vm_map_entry_create: out of map entries"); return (entry); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(map, entry) vm_map_t map; vm_map_entry_t entry; { if ((kentry_count < KENTRY_LOW_WATER) || ((vm_offset_t) entry >= kentry_data && (vm_offset_t) entry < (kentry_data + kentry_data_size)) || ((vm_offset_t) entry >= mapvm_start && (vm_offset_t) entry < mapvmmax)) { entry->next = kentry_free; kentry_free = entry; ++kentry_count; return; } else { if (mappoolcnt < MAPENTRY_LOW_WATER) { entry->next = mappool; mappool = entry; ++mappoolcnt; return; } FREE(entry, M_VMMAPENT); } } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ #define vm_map_entry_link(map, after_where, entry) \ { \ (map)->nentries++; \ (entry)->prev = (after_where); \ (entry)->next = (after_where)->next; \ (entry)->prev->next = (entry); \ (entry)->next->prev = (entry); \ } #define vm_map_entry_unlink(map, entry) \ { \ (map)->nentries--; \ (entry)->next->prev = (entry)->prev; \ (entry)->prev->next = (entry)->next; \ } /* * vm_map_reference: * * Creates another valid reference to the given map. * */ void vm_map_reference(map) register vm_map_t map; { if (map == NULL) return; map->ref_count++; } /* * vm_map_deallocate: * * Removes a reference from the specified map, * destroying it if no references remain. * The map should not be locked. */ void vm_map_deallocate(map) register vm_map_t map; { register int c; if (map == NULL) return; c = map->ref_count; if (c == 0) panic("vm_map_deallocate: deallocating already freed map"); if (c != 1) { --map->ref_count; wakeup(&map->ref_count); return; } /* * Lock the map, to wait out all other references to it. */ vm_map_lock(map); (void) vm_map_delete(map, map->min_offset, map->max_offset); --map->ref_count; if( map->ref_count != 0) { vm_map_unlock(map); return; } pmap_destroy(map->pmap); FREE(map, M_VMMAP); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. */ int -vm_map_insert(map, object, offset, start, end) +vm_map_insert(map, object, offset, start, end, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t start; vm_offset_t end; + vm_prot_t prot, max; + int cow; { register vm_map_entry_t new_entry; register vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; /* * Check that the start and end points are not bogus. 
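* * (The new `cow' argument is a mask of MAP_COPY_NEEDED and MAP_COPY_ON_WRITE from vm_map.h; a hypothetical copy-on-write insertion would pass MAP_COPY_NEEDED | MAP_COPY_ON_WRITE, while the kernel-map callers in vm_kern.c pass 0.)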
*/ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); /* * See if we can avoid creating a new entry by extending one of our * neighbors. */ if (object == NULL) { if ((prev_entry != &map->header) && (prev_entry->end == start) && (map->is_main_map) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (prev_entry->inheritance == VM_INHERIT_DEFAULT) && - (prev_entry->protection == VM_PROT_DEFAULT) && - (prev_entry->max_protection == VM_PROT_DEFAULT) && + (prev_entry->protection == prot) && + (prev_entry->max_protection == max) && (prev_entry->wired_count == 0)) { if (vm_object_coalesce(prev_entry->object.vm_object, OFF_TO_IDX(prev_entry->offset), (vm_size_t) (prev_entry->end - prev_entry->start), (vm_size_t) (end - prev_entry->end))) { /* * Coalesced the two objects - can extend the * previous map entry to include the new * range. */ map->size += (end - prev_entry->end); prev_entry->end = end; return (KERN_SUCCESS); } } } /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->is_a_map = FALSE; new_entry->is_sub_map = FALSE; new_entry->object.vm_object = object; new_entry->offset = offset; - new_entry->copy_on_write = FALSE; - new_entry->needs_copy = FALSE; + if (cow & MAP_COPY_NEEDED) + new_entry->needs_copy = TRUE; + else + new_entry->needs_copy = FALSE; + if (cow & MAP_COPY_ON_WRITE) + new_entry->copy_on_write = TRUE; + else + new_entry->copy_on_write = FALSE; + if (map->is_main_map) { new_entry->inheritance = VM_INHERIT_DEFAULT; - new_entry->protection = VM_PROT_DEFAULT; - new_entry->max_protection = VM_PROT_DEFAULT; + new_entry->protection = prot; + new_entry->max_protection = max; new_entry->wired_count = 0; } /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Update the free space hint */ - if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) + if ((map->first_free == prev_entry) && + (prev_entry->end >= new_entry->start)) map->first_free = new_entry; return (KERN_SUCCESS); } /* * SAVE_HINT: * * Saves the specified entry as the hint for * future lookups. */ #define SAVE_HINT(map,value) \ (map)->hint = (value); /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry(map, address, entry) register vm_map_t map; register vm_offset_t address; vm_map_entry_t *entry; /* OUT */ { register vm_map_entry_t cur; register vm_map_entry_t last; /* * Start looking either from the head of the list, or from the hint. */ cur = map->hint; if (cur == &map->header) cur = cur->next; if (address >= cur->start) { /* * Go from hint to end of list. * * But first, make a quick check to see if we are already looking * at the entry we want (which is usually the case). 
Note also * that we don't need to save the hint here... it is the same * hint (unless we are at the header, in which case the hint * didn't buy us anything anyway). */ last = &map->header; if ((cur != last) && (cur->end > address)) { *entry = cur; return (TRUE); } } else { /* * Go from start to hint, *inclusively* */ last = cur->next; cur = map->header.next; } /* * Search linearly */ while (cur != last) { if (cur->end > address) { if (address >= cur->start) { /* * Save this lookup for future hints, and * return */ *entry = cur; SAVE_HINT(map, cur); return (TRUE); } break; } cur = cur->next; } *entry = cur->prev; SAVE_HINT(map, *entry); return (FALSE); } /* * Find sufficient space for `length' bytes in the given map, starting at * `start'. The map must be locked. Returns 0 on success, 1 on no space. */ int vm_map_findspace(map, start, length, addr) register vm_map_t map; register vm_offset_t start; vm_size_t length; vm_offset_t *addr; { register vm_map_entry_t entry, next; register vm_offset_t end; if (start < map->min_offset) start = map->min_offset; if (start > map->max_offset) return (1); /* * Look for the first possible address; if there's already something * at this address, we have to start after it. */ if (start == map->min_offset) { if ((entry = map->first_free) != &map->header) start = entry->end; } else { vm_map_entry_t tmp; if (vm_map_lookup_entry(map, start, &tmp)) start = tmp->end; entry = tmp; } /* * Look through the rest of the map, trying to fit a new region in the * gap between existing regions, or after the very last region. */ for (;; start = (entry = next)->end) { /* * Find the end of the proposed new region. Be sure we didn't * go beyond the end of the map, or wrap around the address; * if so, we lose. Otherwise, if this is the last entry, or * if the proposed new region fits before the next entry, we * win. */ end = start + length; if (end > map->max_offset || end < start) return (1); next = entry->next; if (next == &map->header || next->start >= end) break; } SAVE_HINT(map, entry); *addr = start; if (map == kernel_map && round_page(start + length) > kernel_vm_end) pmap_growkernel(round_page(start + length)); return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. 
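* * With this change the caller supplies the protections and copy-on-write flags directly; an illustrative kernel-map call, a sketch mirroring the kmem_suballoc() usage above rather than a new interface: * * addr = vm_map_min(kernel_map); * rv = vm_map_find(kernel_map, NULL, (vm_ooffset_t) 0, &addr, * round_page(size), TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); * if (rv != KERN_SUCCESS) * return (0);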
* */ int -vm_map_find(map, object, offset, addr, length, find_space) +vm_map_find(map, object, offset, addr, length, find_space, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t *addr; /* IN/OUT */ vm_size_t length; boolean_t find_space; + vm_prot_t prot, max; + int cow; { register vm_offset_t start; int result, s = 0; start = *addr; if (map == kmem_map) s = splhigh(); vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); if (map == kmem_map) splx(s); return (KERN_NO_SPACE); } start = *addr; } - result = vm_map_insert(map, object, offset, start, start + length); + result = vm_map_insert(map, object, offset, + start, start + length, prot, max, cow); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } #ifdef notyet /* * vm_map_simplify_entry: [ internal use only ] * * Simplify the given map entry by: * removing extra sharing maps * [XXX maybe later] merging with a neighbor */ static void vm_map_simplify_entry(map, entry) vm_map_t map; vm_map_entry_t entry; { #ifdef lint map++; #endif /* * If this entry corresponds to a sharing map, then see if we can * remove the level of indirection. If it's not a sharing map, then it * points to a VM object, so see if we can merge with either of our * neighbors. */ if (entry->is_sub_map) return; if (entry->is_a_map) { #if 0 vm_map_t my_share_map; int count; my_share_map = entry->object.share_map; count = my_share_map->ref_count; if (count == 1) { /* * Can move the region from entry->start to entry->end * (+ entry->offset) in my_share_map into place of * entry. Later. */ } #endif } else { /* * Try to merge with our neighbors. * * Conditions for merge are: * * 1. entries are adjacent. 2. both entries point to objects * with null pagers. * * If a merge is possible, we replace the two entries with a * single entry, then merge the two objects into a single * object. * * Now, all that is left to do is write the code! */ } } #endif /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(map, entry, start) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t start; { register vm_map_entry_t new_entry; /* * See if we can simplify this entry first */ /* vm_map_simplify_entry(map, entry); */ /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if (endaddr < entry->end) \ _vm_map_clip_end(map, entry, endaddr); \ } /* * This routine is called only when it is known that * the entry must be split. 
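* * Illustrative effect (hypothetical addresses): clipping the entry [0x1000, 0x4000) with offset 0 at end address 0x2000 yields [0x1000, 0x2000) with offset 0 followed by [0x2000, 0x4000) with offset 0x1000, so both halves still map the same object pages.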
*/ static void _vm_map_clip_end(map, entry, end) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t end; { register vm_map_entry_t new_entry; /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this range after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap(map, start, end, submap) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; vm_map_t submap; { vm_map_entry_t entry; register int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && (!entry->is_a_map) && (entry->object.vm_object == NULL) && (!entry->copy_on_write)) { entry->is_a_map = FALSE; entry->is_sub_map = TRUE; vm_map_reference(entry->object.sub_map = submap); result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(map, start, end, new_prot, set_max) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t new_prot; register boolean_t set_max; { register vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->is_sub_map) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.]
*/ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) ((entry)->copy_on_write ? ~VM_PROT_WRITE : \ VM_PROT_ALL) #define max(a,b) ((a) > (b) ? (a) : (b)) if (current->is_a_map) { vm_map_entry_t share_entry; vm_offset_t share_end; vm_map_lock(current->object.share_map); (void) vm_map_lookup_entry( current->object.share_map, current->offset, &share_entry); share_end = current->offset + (current->end - current->start); while ((share_entry != &current->object.share_map->header) && (share_entry->start < share_end)) { pmap_protect(map->pmap, (max(share_entry->start, current->offset) - current->offset + current->start), min(share_entry->end, share_end) - current->offset + current->start, current->protection & MASK(share_entry)); share_entry = share_entry->next; } vm_map_unlock(current->object.share_map); } else pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(entry)); #undef max #undef MASK } current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ int vm_map_inherit(map, start, end, new_inheritance) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_inherit_t new_inheritance; { register vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_pageable: * * Sets the pageability of the specified address * range in the target map. Regions specified * as not pageable require locked-down physical * memory and physical page maps. * * The map must not be locked, but a reference * must remain to the map throughout the call. */ int vm_map_pageable(map, start, end, new_pageable) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register boolean_t new_pageable; { register vm_map_entry_t entry; vm_map_entry_t start_entry; register vm_offset_t failed = 0; int rv; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); /* * Only one pageability change may take place at one time, since * vm_fault assumes it will be called only once for each * wiring/unwiring. Therefore, we have to make sure we're actually * changing the pageability for the entire region. We do so before * making any changes. */ if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } entry = start_entry; /* * Actions are rather different for wiring and unwiring, so we have * two separate cases. */ if (new_pageable) { vm_map_clip_start(map, entry, start); /* * Unwiring.
First ensure that the range to be unwired is * really wired down and that there are no holes. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0 || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Now decrement the wiring count for each region. If a region * becomes completely unwired, unwire its physical pages and * mappings. */ lock_set_recursive(&map->lock); entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry->start, entry->end); entry = entry->next; } lock_clear_recursive(&map->lock); } else { /* * Wiring. We must do this in two passes: * * 1. Holding the write lock, we create any shadow or zero-fill * objects that need to be created. Then we clip each map * entry to the region to be wired and increment its wiring * count. We create objects before clipping the map entries * to avoid object proliferation. * * 2. We downgrade to a read lock, and call vm_fault_wire to * fault in the pages for any newly wired area (wired_count is * 1). * * Downgrading to a read lock for vm_fault_wire avoids a possible * deadlock with another process that may have faulted on one * of the pages to be wired (it would mark the page busy, * blocking us, then in turn block on the map lock that we * hold). Because of problems in the recursive lock package, * we cannot upgrade to a write lock in vm_map_lookup. Thus, * any actions that require the write lock must be done * beforehand. Because we keep the read lock on the map, the * copy-on-write status of the entries we modify here cannot * change. */ /* * Pass 1. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0) { /* * Perform actions of vm_map_lookup that need * the write lock on the map: create a shadow * object for a copy-on-write region, or an * object for a zero-fill region. * * We don't have to do this for entries that * point to sharing maps, because we won't * hold the lock on the sharing map. */ if (!entry->is_a_map && !entry->is_sub_map) { if (entry->needs_copy && ((entry->protection & VM_PROT_WRITE) != 0)) { vm_object_shadow(&entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; } else if (entry->object.vm_object == NULL) { entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } } } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); entry->wired_count++; /* * Check for holes */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { /* * Found one. Object creation actions do not * need to be undone, but the wired counts * need to be restored. */ while (entry != &map->header && entry->end > start) { entry->wired_count--; entry = entry->prev; } vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Pass 2. */ /* * HACK HACK HACK HACK * * If we are wiring in the kernel map or a submap of it, * unlock the map to avoid deadlocks. We trust that the * kernel is well-behaved, and therefore will not do * anything destructive to this region of the map while * we have it unlocked. We cannot trust user processes * to do the same. 
* * HACK HACK HACK HACK */ if (vm_map_pmap(map) == kernel_pmap) { vm_map_unlock(map); /* trust me ... */ } else { lock_set_recursive(&map->lock); lock_write_to_read(&map->lock); } rv = 0; entry = start_entry; while (entry != &map->header && entry->start < end) { /* * If vm_fault_wire fails for any page we need to undo * what has been done. We decrement the wiring count * for those pages which have not yet been wired (now) * and unwire those that have (later). * * XXX this violates the locking protocol on the map, * needs to be fixed. */ if (rv) entry->wired_count--; else if (entry->wired_count == 1) { rv = vm_fault_wire(map, entry->start, entry->end); if (rv) { failed = entry->start; entry->wired_count--; } } entry = entry->next; } if (vm_map_pmap(map) == kernel_pmap) { vm_map_lock(map); } else { lock_clear_recursive(&map->lock); } if (rv) { vm_map_unlock(map); (void) vm_map_pageable(map, start, failed, TRUE); return (rv); } } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_clean * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_clean(map, start, end, syncio, invalidate) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t syncio; boolean_t invalidate; { register vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } /* * Make a first pass to check for holes. */ for (current = entry; current->start < end; current = current->next) { if (current->is_sub_map) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->is_a_map || current->is_sub_map) { register vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.share_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } if (object && (object->type == OBJT_VNODE)) { /* * Flush pages if writing is allowed. XXX should we continue * on an error? * * XXX Doing async I/O and then removing all the pages from * the object before it completes is probably a very bad * idea. */ if (current->protection & VM_PROT_WRITE) vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), syncio, TRUE); if (invalidate) vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), FALSE); } start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] 
*/ static void vm_map_entry_unwire(map, entry) vm_map_t map; register vm_map_entry_t entry; { vm_fault_unwire(map, entry->start, entry->end); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(map, entry) register vm_map_t map; register vm_map_entry_t entry; { if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if (entry->is_a_map || entry->is_sub_map) vm_map_deallocate(entry->object.share_map); else vm_object_deallocate(entry->object.vm_object); vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. * * When called with a sharing map, removes pages from * that region from all physical maps. */ int vm_map_delete(map, start, end) register vm_map_t map; vm_offset_t start; register vm_offset_t end; { register vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); /* * Fix the lookup hint now, rather than each time through the * loop. */ SAVE_HINT(map, entry->prev); } /* * Save the free space hint */ if (map->first_free->start >= start) map->first_free = entry->prev; /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; register vm_offset_t s, e; register vm_object_t object; vm_map_clip_end(map, entry, end); next = entry->next; s = entry->start; e = entry->end; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ object = entry->object.vm_object; if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * If this is a sharing map, we must remove *all* references * to this data, since we can't find all of the physical maps * which are sharing it. */ if (object == kernel_object || object == kmem_object) vm_object_page_remove(object, OFF_TO_IDX(entry->offset), OFF_TO_IDX(entry->offset + (e - s)), FALSE); else if (!map->is_main_map) vm_object_pmap_remove(object, OFF_TO_IDX(entry->offset), OFF_TO_IDX(entry->offset + (e - s))); else pmap_remove(map->pmap, s, e); /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(map, start, end) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; { register int result, s = 0; if (map == kmem_map) s = splhigh(); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified * privilege on the entire address region given. * The entire region must be allocated.
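* * A sketch of a typical caller (hypothetical; page rounding and the error path are the caller's concern): * * if (!vm_map_check_protection(map, trunc_page(addr), * round_page(addr + len), VM_PROT_READ)) * return (FALSE);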
*/ boolean_t vm_map_check_protection(map, start, end, protection) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t protection; { register vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) { return (FALSE); } entry = tmp_entry; while (start < end) { if (entry == &map->header) { return (FALSE); } /* * No holes allowed! */ if (start < entry->start) { return (FALSE); } /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) { return (FALSE); } /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry) vm_map_t src_map, dst_map; register vm_map_entry_t src_entry, dst_entry; { vm_pindex_t temp_pindex; if (src_entry->is_sub_map || dst_entry->is_sub_map) return; if (dst_entry->object.vm_object != NULL) printf("vm_map_copy_entry: dst_entry object not NULL!\n"); /* * If our destination map was wired down, unwire it now. */ if (dst_entry->wired_count != 0) vm_map_entry_unwire(dst_map, dst_entry); - /* - * If we're dealing with a sharing map, we must remove the destination - * pages from all maps (since we cannot know which maps this sharing - * map belongs in). - */ - - if (dst_map->is_main_map) - pmap_remove(dst_map->pmap, dst_entry->start, dst_entry->end); - else - vm_object_pmap_remove(dst_entry->object.vm_object, - OFF_TO_IDX(dst_entry->offset), - OFF_TO_IDX(dst_entry->offset + - (dst_entry->end - dst_entry->start))); - if (src_entry->wired_count == 0) { boolean_t src_needs_copy; /* * If the source entry is marked needs_copy, it is already * write-protected. */ if (!src_entry->needs_copy) { boolean_t su; /* * If the source entry has only one mapping, we can * just protect the virtual address range. */ if (!(su = src_map->is_main_map)) { su = (src_map->ref_count == 1); } +#ifdef VM_MAP_OLD if (su) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } else { +#endif vm_object_pmap_copy(src_entry->object.vm_object, OFF_TO_IDX(src_entry->offset), OFF_TO_IDX(src_entry->offset + (src_entry->end - src_entry->start))); +#ifdef VM_MAP_OLD } +#endif } /* * Make a copy of the object. */ temp_pindex = OFF_TO_IDX(dst_entry->offset); vm_object_copy(src_entry->object.vm_object, OFF_TO_IDX(src_entry->offset), &dst_entry->object.vm_object, &temp_pindex, &src_needs_copy); dst_entry->offset = IDX_TO_OFF(temp_pindex); /* * If we didn't get a copy-object now, mark the source map * entry so that a shadow will be created to hold its changed * pages. */ if (src_needs_copy) src_entry->needs_copy = TRUE; /* * The destination always needs to have a shadow created. */ dst_entry->needs_copy = TRUE; /* * Mark the entries copy-on-write, so that write-enabling the * entry won't make copy-on-write pages writable. */ src_entry->copy_on_write = TRUE; dst_entry->copy_on_write = TRUE; pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. 
* Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * The source map must not be locked. */ struct vmspace * vmspace_fork(vm1) register struct vmspace *vm1; { register struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; pmap_t new_pmap; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, old_map->entries_pageable); bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); new_pmap = &vm2->vm_pmap; /* XXX */ new_map = &vm2->vm_map; /* XXX */ old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->is_sub_map) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, referencing the sharing map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; ++new_entry->object.vm_object->ref_count; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->is_a_map = FALSE; vm_map_entry_link(new_map, new_map->header.prev, new_entry); - vm_map_copy_entry(old_map, new_map, old_entry, new_entry); + vm_map_copy_entry(old_map, new_map, old_entry, + new_entry); break; } old_entry = old_entry->next; } new_map->size = old_map->size; vm_map_unlock(old_map); return (vm2); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(var_map, vaddr, fault_type, out_entry, object, pindex, out_prot, wired, single_use) vm_map_t *var_map; /* IN/OUT */ register vm_offset_t vaddr; register vm_prot_t fault_type; vm_map_entry_t *out_entry; /* OUT */ vm_object_t *object; /* OUT */ vm_pindex_t *pindex; /* OUT */ vm_prot_t *out_prot; /* OUT */ boolean_t *wired; /* OUT */ boolean_t *single_use; /* OUT */ { vm_map_t share_map; vm_offset_t share_offset; register vm_map_entry_t entry; register vm_map_t map = *var_map; register vm_prot_t prot; register boolean_t su; RetryLookup:; /* * Lookup the faulting address. 
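* (For reference, a fault-handler-style caller is expected to bracket the lookup as in this sketch, variable names hypothetical: * * result = vm_map_lookup(&map, vaddr, fault_type, &entry, * &object, &pindex, &prot, &wired, &su); * ... * vm_map_lookup_done(map, entry); * * keeping the map read-locked in between.)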
*/ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return(why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->hint; *out_entry = entry; if ((entry == &map->header) || (vaddr < entry->start) || (vaddr >= entry->end)) { vm_map_entry_t tmp_entry; /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) RETURN(KERN_INVALID_ADDRESS); entry = tmp_entry; *out_entry = entry; } /* * Handle submaps. */ if (entry->is_sub_map) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; if ((fault_type & (prot)) != fault_type) RETURN(KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If we don't already have a VM object, track it down. */ su = !entry->is_a_map; if (su) { share_map = map; share_offset = vaddr; } else { vm_map_entry_t share_entry; /* * Compute the sharing map, and offset into it. */ share_map = entry->object.share_map; share_offset = (vaddr - entry->start) + entry->offset; /* * Look for the backing store object and offset */ vm_map_lock_read(share_map); if (!vm_map_lookup_entry(share_map, share_offset, &share_entry)) { vm_map_unlock_read(share_map); RETURN(KERN_INVALID_ADDRESS); } entry = share_entry; } /* * If the entry was copy-on-write, we either ... */ if (entry->needs_copy) { /* * If we want to write the page, we may as well handle that * now since we've got the sharing map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the share map to the new * object. */ if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } vm_object_shadow( &entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; lock_write_to_read(&share_map->lock); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= (~VM_PROT_WRITE); } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL) { if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = 0; lock_write_to_read(&share_map->lock); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((share_offset - entry->start) + entry->offset); *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. */ if (!su) { su = (share_map->ref_count == 1); } *out_prot = prot; *single_use = su; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(map, entry) register vm_map_t map; vm_map_entry_t entry; { /* * If this entry references a map, unlock it first. 
*/ if (entry->is_a_map) vm_map_unlock_read(entry->object.share_map); /* * Unlock the main-level map */ vm_map_unlock_read(map); } /* * Routine: vm_map_simplify * Purpose: * Attempt to simplify the map representation in * the vicinity of the given starting address. * Note: * This routine is intended primarily to keep the * kernel maps more compact -- they generally don't * benefit from the "expand a map entry" technology * at allocation time because the adjacent entry * is often wired down. */ void vm_map_simplify(map, start) vm_map_t map; vm_offset_t start; { vm_map_entry_t this_entry; vm_map_entry_t prev_entry; vm_map_lock(map); if ( (vm_map_lookup_entry(map, start, &this_entry)) && ((prev_entry = this_entry->prev) != &map->header) && (prev_entry->end == start) && (map->is_main_map) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (this_entry->is_a_map == FALSE) && (this_entry->is_sub_map == FALSE) && (prev_entry->inheritance == this_entry->inheritance) && (prev_entry->protection == this_entry->protection) && (prev_entry->max_protection == this_entry->max_protection) && (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->copy_on_write == this_entry->copy_on_write) && (prev_entry->needs_copy == this_entry->needs_copy) && (prev_entry->object.vm_object == this_entry->object.vm_object) && ((prev_entry->offset + (prev_entry->end - prev_entry->start)) == this_entry->offset) ) { if (map->first_free == this_entry) map->first_free = prev_entry; if (!this_entry->object.vm_object->paging_in_progress) { SAVE_HINT(map, prev_entry); vm_map_entry_unlink(map, this_entry); prev_entry->end = this_entry->end; vm_object_deallocate(this_entry->object.vm_object); vm_map_entry_dispose(map, this_entry); } } vm_map_unlock(map); } #ifdef DDB /* * vm_map_print: [ debug ] */ void vm_map_print(imap, full, dummy3, dummy4) /* db_expr_t */ int imap; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { register vm_map_entry_t entry; register vm_map_t map = (vm_map_t)imap; /* XXX */ iprintf("%s map 0x%x: pmap=0x%x,ref=%d,nentries=%d,version=%d\n", (map->is_main_map ? "Task" : "Share"), (int) map, (int) (map->pmap), map->ref_count, map->nentries, map->timestamp); if (!full && indent) return; indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { iprintf("map entry 0x%x: start=0x%x, end=0x%x, ", (int) entry, (int) entry->start, (int) entry->end); if (map->is_main_map) { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; printf("prot=%x/%x/%s, ", entry->protection, entry->max_protection, inheritance_name[entry->inheritance]); if (entry->wired_count != 0) printf("wired, "); } if (entry->is_a_map || entry->is_sub_map) { printf("share=0x%x, offset=0x%x\n", (int) entry->object.share_map, (int) entry->offset); if ((entry->prev == &map->header) || (!entry->prev->is_a_map) || (entry->prev->object.share_map != entry->object.share_map)) { indent += 2; vm_map_print((int)entry->object.share_map, full, 0, (char *)0); indent -= 2; } } else { printf("object=0x%x, offset=0x%x", (int) entry->object.vm_object, (int) entry->offset); if (entry->copy_on_write) printf(", copy (%s)", entry->needs_copy ? 
"needed" : "done"); printf("\n"); if ((entry->prev == &map->header) || (entry->prev->is_a_map) || (entry->prev->object.vm_object != entry->object.vm_object)) { indent += 2; vm_object_print((int)entry->object.vm_object, full, 0, (char *)0); indent -= 2; } } } indent -= 2; } #endif Index: head/sys/vm/vm_map.h =================================================================== --- head/sys/vm/vm_map.h (revision 13489) +++ head/sys/vm/vm_map.h (revision 13490) @@ -1,234 +1,241 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.h 8.3 (Berkeley) 3/15/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_map.h,v 1.9 1995/12/11 04:58:14 dyson Exp $ + * $Id: vm_map.h,v 1.10 1995/12/14 09:55:00 phk Exp $ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. * vm_map_version_t a timestamp of a map, for use with vm_map_lookup */ /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *share_map; /* share map */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. */ struct vm_map_entry { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ boolean_t is_a_map:1, /* Is "object" a map? */ is_sub_map:1, /* Is "object" a submap? */ /* Only in sharing maps: */ copy_on_write:1, /* is data copy-on-write */ needs_copy:1; /* does object need to be copied */ /* Only in task maps: */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ }; /* * Maps are doubly-linked lists of map entries, kept sorted * by address. A single hint is provided to start * searches again from the last successful search, * insertion, or removal. */ struct vm_map { struct pmap *pmap; /* Physical map */ lock_data_t lock; /* Lock for map data */ struct vm_map_entry header; /* List of entries */ int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ boolean_t is_main_map; /* Am I a main map? */ int ref_count; /* Reference count */ vm_map_entry_t hint; /* hint for quick lookups */ vm_map_entry_t first_free; /* First free space hint */ boolean_t entries_pageable; /* map entries pageable?? */ unsigned int timestamp; /* Version number */ #define min_offset header.start #define max_offset header.end }; /* * Shareable process virtual address space. * May eventually be merged with vm_map. * Several fields are temporary (text, data stuff). */ struct vmspace { struct vm_map vm_map; /* VM address map */ struct pmap vm_pmap; /* private physical map */ int vm_refcnt; /* number of references */ caddr_t vm_shm; /* SYS5 shared memory private data XXX */ + vm_object_t vm_upages_obj; /* UPAGES object */ /* we copy from vm_startcopy to the end of the structure on fork */ #define vm_startcopy vm_rssize segsz_t vm_rssize; /* current resident set size in pages */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* user virtual address of text XXX */ caddr_t vm_daddr; /* user virtual address of data XXX */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ caddr_t vm_minsaddr; /* user VA at max stack growth */ }; /* * Map versions are used to validate a previous lookup attempt. 
* * Since lookup operations may involve both a main map and * a sharing map, it is necessary to have a timestamp from each. * [If the main map timestamp has changed, the share_map and * associated timestamp are no longer valid; the map version * does not include a reference for the imbedded share_map.] */ typedef struct { int main_timestamp; vm_map_t share_map; int share_timestamp; } vm_map_version_t; /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. */ #define vm_map_lock(map) { \ lock_write(&(map)->lock); \ (map)->timestamp++; \ } #define vm_map_unlock(map) lock_write_done(&(map)->lock) #define vm_map_lock_read(map) lock_read(&(map)->lock) #define vm_map_unlock_read(map) lock_read_done(&(map)->lock) /* * Functions implemented as macros */ #define vm_map_min(map) ((map)->min_offset) #define vm_map_max(map) ((map)->max_offset) #define vm_map_pmap(map) ((map)->pmap) /* XXX: number of kernel maps and entries to statically allocate */ #define MAX_KMAP 10 #define MAX_KMAPENT 128 +/* + * Copy-on-write flags for vm_map operations + */ +#define MAP_COPY_NEEDED 0x1 +#define MAP_COPY_ON_WRITE 0x2 + #ifdef KERNEL extern vm_offset_t kentry_data; extern vm_size_t kentry_data_size; boolean_t vm_map_check_protection __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t)); int vm_map_copy __P((vm_map_t, vm_map_t, vm_offset_t, vm_size_t, vm_offset_t, boolean_t, boolean_t)); struct pmap; vm_map_t vm_map_create __P((struct pmap *, vm_offset_t, vm_offset_t, boolean_t)); void vm_map_deallocate __P((vm_map_t)); int vm_map_delete __P((vm_map_t, vm_offset_t, vm_offset_t)); -int vm_map_find __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t)); +int vm_map_find __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int)); int vm_map_findspace __P((vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *)); int vm_map_inherit __P((vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t)); void vm_map_init __P((struct vm_map *, vm_offset_t, vm_offset_t, boolean_t)); -int vm_map_insert __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t)); +int vm_map_insert __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int)); int vm_map_lookup __P((vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *, boolean_t *)); void vm_map_lookup_done __P((vm_map_t, vm_map_entry_t)); boolean_t vm_map_lookup_entry __P((vm_map_t, vm_offset_t, vm_map_entry_t *)); int vm_map_pageable __P((vm_map_t, vm_offset_t, vm_offset_t, boolean_t)); int vm_map_clean __P((vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t)); int vm_map_protect __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); void vm_map_reference __P((vm_map_t)); int vm_map_remove __P((vm_map_t, vm_offset_t, vm_offset_t)); void vm_map_simplify __P((vm_map_t, vm_offset_t)); void vm_map_startup __P((void)); int vm_map_submap __P((vm_map_t, vm_offset_t, vm_offset_t, vm_map_t)); #endif #endif /* _VM_MAP_ */ Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 13489) +++ head/sys/vm/vm_mmap.c (revision 13490) @@ -1,746 +1,739 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.33 1995/12/13 12:28:39 dyson Exp $ + * $Id: vm_mmap.c,v 1.34 1995/12/17 07:19:57 bde Exp $ */ /* * Mapped file (mmap) interface to VM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* ARGSUSED */ int sbrk(p, uap, retval) struct proc *p; struct sbrk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* ARGSUSED */ int sstk(p, uap, retval) struct proc *p; struct sstk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(p, uap, retval) struct proc *p; struct getpagesize_args *uap; int *retval; { *retval = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int mmap(p, uap, retval) struct proc *p; register struct mmap_args *uap; int *retval; { register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size; vm_prot_t prot, maxprot; caddr_t handle; int flags, error; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; /* * Address (if FIXED) must be page aligned. 
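(That is, the PAGE_MASK bits of addr must be zero when MAP_FIXED is given.)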
Size is implicitly rounded * to a page boundary. */ addr = (vm_offset_t) uap->addr; if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || (ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); size = (vm_size_t) round_page(uap->len); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX if no hint provided for a non-fixed mapping place it after the * end of the largest possible heap. * * There should really be a pmap call to determine a reasonable location. */ if (addr == 0 && (flags & MAP_FIXED) == 0) addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. */ if (((unsigned) uap->fd) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EINVAL); vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) return (EINVAL); /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; } else { /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? 
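(execute permission is granted unconditionally for file mappings at this point; the read and write bits are derived from the open-mode flags just below)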
*/ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) return (EACCES); if (flags & MAP_SHARED) { if (fp->f_flag & FWRITE) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) return (EACCES); } else maxprot |= VM_PROT_WRITE; handle = (caddr_t) vp; } } error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, handle, uap->pos); if (error == 0) *retval = (int) addr; return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(p, uap, retval) struct proc *p; register struct ommap_args *uap; int *retval; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; if (uap->flags & OMAP_INHERIT) nargs.flags |= MAP_INHERIT; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(p, &nargs, retval)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { caddr_t addr; int len; int flags; }; #endif int msync(p, uap, retval) struct proc *p; struct msync_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int flags; vm_map_t map; int rv; map = &p->p_vmspace->vm_map; addr = (vm_offset_t) uap->addr; size = round_page((vm_size_t) uap->len); flags = uap->flags; if (((int) addr & PAGE_MASK) || addr + size < addr || (flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); /* * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) return (EINVAL); addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: break; case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { caddr_t addr; int len; }; #endif int munmap(p, uap, retval) register struct proc *p; register struct munmap_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; vm_map_t map; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return (EINVAL); size = (vm_size_t) round_page(uap->len); if (size == 0) return (0); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). 
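* For example (illustrative 32-bit values, not from this change): addr = 0xfffff000 * with size = 0x2000 makes addr + size wrap around to 0x1000, which the * (addr + size < addr) test below rejects.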
*/ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); map = &p->p_vmspace->vm_map; /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) return (EINVAL); /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr + size); return (0); } void munmapfd(p, fd) struct proc *p; int fd; { /* * XXX should unmap any regions mapped to this file */ p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { caddr_t addr; int len; int prot; }; #endif int mprotect(p, uap, retval) struct proc *p; struct mprotect_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; register vm_prot_t prot; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return (EINVAL); size = (vm_size_t) uap->len; prot = uap->prot & VM_PROT_ALL; switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { caddr_t addr; int len; int behav; }; #endif /* ARGSUSED */ int madvise(p, uap, retval) struct proc *p; struct madvise_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { caddr_t addr; int len; char *vec; }; #endif /* ARGSUSED */ int mincore(p, uap, retval) struct proc *p; struct mincore_args *uap; int *retval; { vm_offset_t addr; vm_offset_t end; char *vec; addr = trunc_page((vm_offset_t) uap->addr); end = addr + round_page((vm_size_t) uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); vec = uap->vec; while(addr < end) { int error; if (pmap_extract(&p->p_vmspace->vm_pmap, addr)) { error = subyte( vec, 1); } else { error = subyte( vec, 0); } if (error) return EFAULT; vec++; addr += PAGE_SIZE; } return (0); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { caddr_t addr; size_t len; }; #endif int mlock(p, uap, retval) struct proc *p; struct mlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); size = round_page((vm_size_t) uap->len); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (EAGAIN); #else error = suser(p->p_ucred, &p->p_acflag); if (error) return (error); #endif error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { caddr_t addr; size_t len; }; #endif int munlock(p, uap, retval) struct proc *p; struct munlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); #ifndef pmap_wired_count error = suser(p->p_ucred, &p->p_acflag); if (error) return (error); #endif size = round_page((vm_size_t) uap->len); error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * Internal version of mmap. * Currently used by mmap, exec, and sys5 shared memory. 
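* * As of this revision a private (non-MAP_SHARED) file mapping passes its * copy-on-write setup down to vm_map_find() as flags instead of patching * the map entry afterwards; condensed from the code below: * * docow = MAP_COPY_ON_WRITE | MAP_COPY_NEEDED; * rv = vm_map_find(map, object, foff, addr, size, fitit, * prot, maxprot, docow); *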
* Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) register vm_map_t map; register vm_offset_t *addr; register vm_size_t size; vm_prot_t prot, maxprot; register int flags; caddr_t handle; /* XXX should be vp */ vm_ooffset_t foff; { boolean_t fitit; - vm_object_t object; + vm_object_t object, object2; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; - vm_size_t objsize; + vm_ooffset_t objsize; + int docow; struct proc *p = curproc; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { type = OBJT_SWAP; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (caddr_t) vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, p->p_ucred, p); if (error) return (error); - objsize = vat.va_size; + objsize = round_page(vat.va_size); type = OBJT_VNODE; } } - object = vm_pager_allocate(type, handle, objsize, prot, foff); + object = vm_pager_allocate(type, handle, OFF_TO_IDX(objsize), prot, foff); if (object == NULL) return (type == OBJT_DEVICE ? EINVAL : ENOMEM); - rv = vm_map_find(map, object, foff, addr, size, fitit); + object2 = NULL; + docow = 0; + if ((flags & (MAP_ANON|MAP_SHARED)) == 0 && (type != OBJT_DEVICE)) { + docow = MAP_COPY_ON_WRITE; + if (objsize < size) { + object2 = vm_object_allocate( OBJT_DEFAULT, + OFF_TO_IDX(size - (foff & ~(PAGE_SIZE - 1)))); + object2->backing_object = object; + object2->backing_object_offset = foff; + TAILQ_INSERT_TAIL(&object->shadow_head, + object2, shadow_list); + } else { + docow |= MAP_COPY_NEEDED; + } + } + if (object2) + rv = vm_map_find(map, object2, 0, addr, size, fitit, + prot, maxprot, docow); + else + rv = vm_map_find(map, object, foff, addr, size, fitit, + prot, maxprot, docow); + + if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ - vm_object_deallocate(object); + if (object2) + vm_object_deallocate(object2); + else + vm_object_deallocate(object); goto out; } /* - * mmap a COW regular file - */ - if ((flags & (MAP_ANON|MAP_SHARED)) == 0 && (type != OBJT_DEVICE)) { - vm_map_entry_t entry; - if (!vm_map_lookup_entry(map, *addr, &entry)) { - panic("vm_mmap: missing map entry!!!"); - } - entry->copy_on_write = TRUE; - /* - * This will create the processes private object on - * an as needed basis. - */ - entry->needs_copy = TRUE; - - /* - * set pages COW and protect for read access only - */ - vm_object_pmap_copy(object, foff, foff + size); - - } - - /* * "Pre-fault" resident pages. 
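* (That is, enter pmap mappings up front for pages of the vnode object * that are already resident, sparing the process one soft fault per page * on first touch; with this change the kernel map is excluded.)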
*/ - if ((type == OBJT_VNODE) && (map->pmap != NULL)) { + if ((map != kernel_map) && + (type == OBJT_VNODE) && (map->pmap != NULL)) { pmap_object_init_pt(map->pmap, *addr, object, (vm_pindex_t) OFF_TO_IDX(foff), size); } - /* - * Correct protection (default is VM_PROT_ALL). If maxprot is - * different than prot, we must set both explicitly. - */ - rv = KERN_SUCCESS; - if (maxprot != VM_PROT_ALL) - rv = vm_map_protect(map, *addr, *addr + size, maxprot, TRUE); - if (rv == KERN_SUCCESS && prot != maxprot) - rv = vm_map_protect(map, *addr, *addr + size, prot, FALSE); - if (rv != KERN_SUCCESS) { - (void) vm_map_remove(map, *addr, *addr + size); - goto out; - } /* * Shared memory is also shared with children. */ if (flags & MAP_SHARED) { rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { (void) vm_map_remove(map, *addr, *addr + size); goto out; } } out: switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 13489) +++ head/sys/vm/vm_object.c (revision 13490) @@ -1,1393 +1,1445 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.c,v 1.61 1996/01/04 18:32:31 davidg Exp $ + * $Id: vm_object.c,v 1.62 1996/01/04 21:13:20 wollman Exp $ */ /* * Virtual memory object module. */ #include "opt_ddb.h" #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB static void DDB_vm_object_check __P((void)); #endif static void _vm_object_allocate __P((objtype_t, vm_size_t, vm_object_t)); #ifdef DDB static int _vm_object_in_map __P((vm_map_t map, vm_object_t object, vm_map_entry_t entry)); static int vm_object_in_map __P((vm_object_t object)); #endif static void vm_object_qcollapse __P((vm_object_t object)); #ifdef not_used static void vm_object_deactivate_pages __P((vm_object_t)); #endif static void vm_object_terminate __P((vm_object_t)); static void vm_object_cache_trim __P((void)); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
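* * An illustrative shadow chain (not taken from this file): * * shadow2 -> shadow1 -> original_object * * where each arrow is a backing_object pointer; each level holds only * the pages written through it and defers to the object behind it for * everything else.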
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ int vm_object_cache_max; struct object_q vm_object_cached_list; static int vm_object_cached; struct object_q vm_object_list; static long vm_object_count; vm_object_t kernel_object; vm_object_t kmem_object; static struct vm_object kernel_object_store; static struct vm_object kmem_object_store; extern int vm_pageout_page_count; static long object_collapses; static long object_bypasses; static void _vm_object_allocate(type, size, object) objtype_t type; vm_size_t size; register vm_object_t object; { TAILQ_INIT(&object->memq); TAILQ_INIT(&object->shadow_head); object->type = type; object->size = size; object->ref_count = 1; object->flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->handle = NULL; object->paging_offset = (vm_ooffset_t) 0; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; object->last_read = 0; TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); vm_object_count++; } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init() { TAILQ_INIT(&vm_object_cached_list); TAILQ_INIT(&vm_object_list); vm_object_count = 0; vm_object_cache_max = 84; if (cnt.v_page_count > 1000) vm_object_cache_max += (cnt.v_page_count - 1000) / 3; kernel_object = &kernel_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); kmem_object = &kmem_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(type, size) objtype_t type; vm_size_t size; { register vm_object_t result; result = (vm_object_t) malloc((u_long) sizeof *result, M_VMOBJ, M_WAITOK); _vm_object_allocate(type, size, result); return (result); } /* * vm_object_reference: * * Gets another reference to the given object. */ inline void vm_object_reference(object) register vm_object_t object; { if (object == NULL) return; if (object->ref_count == 0) { if ((object->flags & OBJ_CANPERSIST) == 0) panic("vm_object_reference: non-persistent object with 0 ref_count"); TAILQ_REMOVE(&vm_object_cached_list, object, cached_list); vm_object_cached--; } object->ref_count++; } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. 
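* * A minimal usage sketch (hypothetical calls, not code from this file): * * obj = vm_object_allocate(OBJT_DEFAULT, 1); ref_count is now 1 * vm_object_reference(obj); ref_count is now 2 * vm_object_deallocate(obj); ref_count drops back to 1 * vm_object_deallocate(obj); last reference: terminate or cache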
*/ void vm_object_deallocate(object) vm_object_t object; { vm_object_t temp; while (object != NULL) { if (object->ref_count == 0) panic("vm_object_deallocate: object deallocated too many times"); /* * Lose the reference */ object->ref_count--; if (object->ref_count != 0) { if ((object->ref_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = object->shadow_head.tqh_first; if ((robject != NULL) && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { int s; robject->ref_count += 2; object->ref_count += 2; do { s = splhigh(); while (robject->paging_in_progress) { robject->flags |= OBJ_PIPWNT; tsleep(robject, PVM, "objde1", 0); } while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objde2", 0); } splx(s); } while( object->paging_in_progress || robject->paging_in_progress); object->ref_count -= 2; robject->ref_count -= 2; if( robject->ref_count == 0) { robject->ref_count += 1; object = robject; continue; } vm_object_collapse(robject); return; } } /* * If there are still references, then we are done. */ return; } if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; vp->v_flag &= ~VTEXT; } /* * See if this object can persist and has some resident * pages. If so, enter it in the cache. */ if (object->flags & OBJ_CANPERSIST) { if (object->resident_page_count != 0) { vm_object_page_clean(object, 0, 0 ,TRUE, TRUE); TAILQ_INSERT_TAIL(&vm_object_cached_list, object, cached_list); vm_object_cached++; vm_object_cache_trim(); return; } else { object->flags &= ~OBJ_CANPERSIST; } } /* * Make sure no one uses us. */ object->flags |= OBJ_DEAD; temp = object->backing_object; if (temp) TAILQ_REMOVE(&temp->shadow_head, object, shadow_list); vm_object_terminate(object); /* unlocks and deallocates object */ object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. */ static void vm_object_terminate(object) register vm_object_t object; { register vm_page_t p; int s; /* * wait for the pageout daemon to be done with the object */ s = splhigh(); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objtrm", 0); } splx(s); if (object->paging_in_progress != 0) panic("vm_object_deallocate: pageout in progress"); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; VOP_LOCK(vp); vm_object_page_clean(object, 0, 0, TRUE, FALSE); vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); VOP_UNLOCK(vp); } /* * Now free the pages. For internal objects, this also removes them * from paging queues. */ while ((p = object->memq.tqh_first) != NULL) { if (p->flags & PG_BUSY) printf("vm_object_terminate: freeing busy page\n"); PAGE_WAKEUP(p); vm_page_free(p); cnt.v_pfree++; } /* * Let the pager know object is dead. */ vm_pager_deallocate(object); TAILQ_REMOVE(&vm_object_list, object, object_list); vm_object_count--; wakeup(object); /* * Free the space for the object. */ free((caddr_t) object, M_VMOBJ); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. * Leaves page on whatever queue it is currently on. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. 
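* * E.g. vm_object_page_clean(object, 0, 0, TRUE, TRUE) synchronously * writes back every dirty page of a vnode-backed object and holds the * vnode lock for the duration (illustrative call).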
*/ void vm_object_page_clean(object, start, end, syncio, lockflag) vm_object_t object; vm_pindex_t start; vm_pindex_t end; boolean_t syncio; boolean_t lockflag; { - register vm_page_t p; + register vm_page_t p, np, tp; register vm_offset_t tstart, tend; + vm_pindex_t pi; int s; struct vnode *vp; int runlen; + int maxf; + int chkb; + int maxb; + int i; + vm_page_t maf[vm_pageout_page_count]; + vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; vp = object->handle; if (lockflag) VOP_LOCK(vp); object->flags |= OBJ_CLEANING; tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } if ((tstart == 0) && (tend == object->size)) { object->flags &= ~(OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); } + for(p = object->memq.tqh_first; p; p = p->listq.tqe_next) + p->flags |= PG_CLEANCHK; - runlen = 0; - for(;tstart < tend; tstart += 1) { -relookup: - p = vm_page_lookup(object, tstart); - if (!p) { - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - } +rescan: + for(p = object->memq.tqh_first; p; p = np) { + np = p->listq.tqe_next; + + pi = p->pindex; + if (((p->flags & PG_CLEANCHK) == 0) || + (pi < tstart) || (pi >= tend) || + (p->valid == 0) || (p->queue == PQ_CACHE)) { + p->flags &= ~PG_CLEANCHK; continue; } - if ((p->valid == 0) || (p->flags & PG_CACHE)) { - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - } + + vm_page_test_dirty(p); + if ((p->dirty & p->valid) == 0) { + p->flags &= ~PG_CLEANCHK; continue; } - vm_page_protect(p, VM_PROT_READ); - s = splhigh(); - while ((p->flags & PG_BUSY) || p->busy) { - if (runlen > 0) { - splx(s); - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - goto relookup; - } + if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED|PG_REFERENCED; tsleep(p, PVM, "vpcwai", 0); splx(s); - goto relookup; + goto rescan; } splx(s); + + maxf = 0; + for(i=1;i<vm_pageout_page_count;i++) { + if (tp = vm_page_lookup(object, pi + i)) { + if ((tp->flags & PG_BUSY) || + (tp->flags & PG_CLEANCHK) == 0) + break; + vm_page_test_dirty(tp); + if ((tp->dirty & tp->valid) == 0) { + tp->flags &= ~PG_CLEANCHK; + break; + } + maf[ i - 1 ] = tp; + maxf++; + continue; + } + break; + } - if (p->dirty == 0) - vm_page_test_dirty(p); - - if ((p->valid & p->dirty) != 0) { - ma[runlen] = p; - p->flags |= PG_BUSY; - runlen++; - if (runlen >= vm_pageout_page_count) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; + maxb = 0; + chkb = vm_pageout_page_count - maxf; + if (chkb) { + for(i = 1; i < chkb;i++) { + if (tp = vm_page_lookup(object, pi - i)) { + if ((tp->flags & PG_BUSY) || + (tp->flags & PG_CLEANCHK) == 0) + break; + vm_page_test_dirty(tp); + if ((tp->dirty & tp->valid) == 0) { + tp->flags &= ~PG_CLEANCHK; + break; + } + mab[ i - 1 ] = tp; + maxb++; + continue; + } + break; } - } else if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; } - + + for(i=0;i<maxb;i++) { + int index = (maxb - i) - 1; + ma[index] = mab[i]; + ma[index]->flags |= PG_BUSY; + ma[index]->flags &= ~PG_CLEANCHK; + vm_page_protect(ma[index], VM_PROT_READ); + } + vm_page_protect(p, VM_PROT_READ); + p->flags |= PG_BUSY; + p->flags &= ~PG_CLEANCHK; + ma[maxb] = p; + for(i=0;i<maxf;i++) { + int index = (maxb + i) + 1; + ma[index] = maf[i]; + ma[index]->flags |= PG_BUSY; + ma[index]->flags &= ~PG_CLEANCHK; + vm_page_protect(ma[index], VM_PROT_READ); + } + runlen = maxb + maxf + 1; +/* + printf("maxb: %d, maxf: %d, runlen: %d, offset: %d\n", maxb, maxf, runlen, ma[0]->pindex); +*/ + vm_pageout_flush(ma, runlen, 0); + goto rescan; } - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - } VOP_FSYNC(vp, NULL, syncio, curproc); if (lockflag) VOP_UNLOCK(vp); object->flags &=
~OBJ_CLEANING; return; } #ifdef not_used /* XXX I cannot tell if this should be an exported symbol */ /* * vm_object_deactivate_pages * * Deactivate all pages in the specified object. (Keep its pages * in memory even though it is no longer referenced.) * * The object must be locked. */ static void vm_object_deactivate_pages(object) register vm_object_t object; { register vm_page_t p, next; for (p = object->memq.tqh_first; p != NULL; p = next) { next = p->listq.tqe_next; vm_page_deactivate(p); } } #endif /* * Trim the object cache to size. */ static void vm_object_cache_trim() { register vm_object_t object; while (vm_object_cached > vm_object_cache_max) { object = vm_object_cached_list.tqh_first; vm_object_reference(object); pager_cache(object, FALSE); } } /* * vm_object_pmap_copy: * * Makes all physical pages in the specified * object range copy-on-write. No writeable * references to these pages should remain. * * The object must *not* be locked. */ void vm_object_pmap_copy(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0) return; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { vm_page_protect(p, VM_PROT_READ); } object->flags &= ~OBJ_WRITEABLE; } /* * vm_object_pmap_remove: * * Removes all physical pages in the specified * object range from all physical maps. * * The object must *not* be locked. */ void vm_object_pmap_remove(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL) return; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { - vm_page_protect(p, VM_PROT_NONE); + if (p->pindex >= start && p->pindex < end) + vm_page_protect(p, VM_PROT_NONE); } } /* * vm_object_copy: * * Create a new object which is a copy of an existing * object, and mark all of the pages in the existing * object 'copy-on-write'. The new object has one reference. * Returns the new object. * * May defer the copy until later if the object is not backed * up by a non-default pager. */ void vm_object_copy(src_object, src_offset, dst_object, dst_offset, src_needs_copy) register vm_object_t src_object; vm_pindex_t src_offset; vm_object_t *dst_object;/* OUT */ vm_pindex_t *dst_offset;/* OUT */ boolean_t *src_needs_copy; /* OUT */ { if (src_object == NULL) { /* * Nothing to copy */ *dst_object = NULL; *dst_offset = 0; *src_needs_copy = FALSE; return; } /* * Try to collapse the object before copying it. */ if (src_object->handle == NULL && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) vm_object_collapse(src_object); /* * Make another reference to the object */ src_object->ref_count++; *dst_object = src_object; *dst_offset = src_offset; /* * Must make a shadow when write is desired */ *src_needs_copy = TRUE; return; } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. 
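* * Calling convention, sketched with hypothetical variables: * * vm_object_t obj = some_object; vm_ooffset_t off = some_offset; * vm_object_shadow(&obj, &off, length); * * afterwards obj names the new OBJT_DEFAULT shadow, off is 0, and the * caller's old reference has moved behind the shadow.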
*/ void vm_object_shadow(object, offset, length) vm_object_t *object; /* IN/OUT */ vm_ooffset_t *offset; /* IN/OUT */ vm_size_t length; { register vm_object_t source; register vm_object_t result; source = *object; /* * Allocate a new object with the given length */ if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL) panic("vm_object_shadow: no object for shadowing"); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. */ result->backing_object = source; if (source) TAILQ_INSERT_TAIL(&result->backing_object->shadow_head, result, shadow_list); /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; /* * Return the new things */ *offset = 0; *object = result; } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(object) register vm_object_t object; { register vm_object_t backing_object; register vm_pindex_t backing_offset_index, paging_offset_index; vm_pindex_t backing_object_paging_offset_index; vm_pindex_t new_pindex; register vm_page_t p, pp; register vm_size_t size; backing_object = object->backing_object; if (backing_object->ref_count != 1) return; backing_object->ref_count += 2; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset); paging_offset_index = OFF_TO_IDX(object->paging_offset); size = object->size; p = backing_object->memq.tqh_first; while (p) { vm_page_t next; next = p->listq.tqe_next; - if ((p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) || - !p->valid || p->hold_count || p->wire_count || p->busy) { + if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) || + (p->queue == PQ_CACHE) || !p->valid || p->hold_count || p->wire_count || p->busy) { p = next; continue; } vm_page_protect(p, VM_PROT_NONE); new_pindex = p->pindex - backing_offset_index; if (p->pindex < backing_offset_index || new_pindex >= size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index+p->pindex, 1); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, paging_offset_index + new_pindex, NULL, NULL))) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_free(p); } else { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_rename(p, object, new_pindex); p->dirty = VM_PAGE_BITS_ALL; } } p = next; } backing_object->ref_count -= 2; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(object) vm_object_t object; { vm_object_t backing_object; vm_ooffset_t backing_offset; vm_size_t size; vm_pindex_t new_pindex, backing_offset_index; vm_page_t p, pp; while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and no pages in it are currently being paged * out. 
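* (The common source of such chains is copy-on-write: shadow objects * created at fork time become redundant once one side exits, and * collapsing folds them back into a single object.)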
*/ if (object == NULL) return; /* * Make sure there is a backing object. */ if ((backing_object = object->backing_object) == NULL) return; /* * we check the backing object first, because it is most likely * not collapsable. */ if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { return; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); return; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) remove * the parent's reference to it. */ backing_offset = object->backing_object_offset; backing_offset_index = OFF_TO_IDX(backing_offset); size = object->size; /* * If there is exactly one reference to the backing object, we * can collapse it into the parent. */ if (backing_object->ref_count == 1) { backing_object->flags |= OBJ_DEAD; /* * We can collapse the backing object. * * Move all in-memory pages from backing_object to the * parent. Pages that have been paged out will be * overwritten by any of the parent's pages that * shadow them. */ while ((p = backing_object->memq.tqh_first) != 0) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, dispose of * it. * * Otherwise, move it as planned. */ if (p->pindex < backing_offset_index || new_pindex >= size) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { vm_page_rename(p, object, new_pindex); } } } /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { backing_object->paging_in_progress++; if (object->type == OBJT_SWAP) { object->paging_in_progress++; /* * copy shadow object pages into ours * and destroy unneeded pages in * shadow object. */ swap_pager_copy( backing_object, OFF_TO_IDX(backing_object->paging_offset), object, OFF_TO_IDX(object->paging_offset), OFF_TO_IDX(object->backing_object_offset)); vm_object_pip_wakeup(object); } else { object->paging_in_progress++; /* * move the shadow backing_object's pager data to * "object" and convert "object" type to OBJT_SWAP. */ object->type = OBJT_SWAP; object->un_pager.swp.swp_nblocks = backing_object->un_pager.swp.swp_nblocks; object->un_pager.swp.swp_allocsize = backing_object->un_pager.swp.swp_allocsize; object->un_pager.swp.swp_blocks = backing_object->un_pager.swp.swp_blocks; object->un_pager.swp.swp_poip = /* XXX */ backing_object->un_pager.swp.swp_poip; object->paging_offset = backing_object->paging_offset + backing_offset; TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); /* * Convert backing object from OBJT_SWAP to * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is * actually necessary. */ backing_object->type = OBJT_DEFAULT; TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list); /* * free unnecessary blocks */ swap_pager_freespace(object, 0, OFF_TO_IDX(object->paging_offset)); vm_object_pip_wakeup(object); } vm_object_pip_wakeup(backing_object); } /* * Object now shadows whatever backing_object did. 
* Note that the reference to backing_object->backing_object * moves from within backing_object to within object. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); if (backing_object->backing_object) TAILQ_REMOVE(&backing_object->backing_object->shadow_head, backing_object, shadow_list); object->backing_object = backing_object->backing_object; if (object->backing_object) TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ TAILQ_REMOVE(&vm_object_list, backing_object, object_list); vm_object_count--; free((caddr_t) backing_object, M_VMOBJ); object_collapses++; } else { /* * If all of the pages in the backing object are * shadowed by the parent object, the parent object no * longer has to shadow the backing object; it can * shadow the next one in the chain. * * The backing object must not be paged out - we'd have * to check all of the paged-out pages, as well. */ if (backing_object->type != OBJT_DEFAULT) { return; } /* * Should have a check for a 'small' number of pages * here. */ for (p = backing_object->memq.tqh_first; p; p = p->listq.tqe_next) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, keep going. * * Otherwise, the backing_object must be left in * the chain. */ if (p->pindex >= backing_offset_index && new_pindex <= size) { pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) { /* * Page still needed. Can't go any * further. */ return; } } } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); vm_object_reference(object->backing_object = backing_object->backing_object); if (object->backing_object) TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); object->backing_object_offset += backing_object->backing_object_offset; /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish; * so we don't need to call vm_object_deallocate. */ if (backing_object->ref_count == 1) printf("should have called obj deallocate\n"); backing_object->ref_count--; object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: [internal] * * Removes all physical pages in the specified * object range from the object's list of pages. * * The object must be locked. 
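* * E.g. vm_object_page_remove(object, 0, OFF_TO_IDX(len), FALSE) frees * every resident page in the range regardless of modification state; * with clean_only == TRUE dirty pages are spared, and as of this change * wired pages are only invalidated, never freed (illustrative call).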
*/ void vm_object_page_remove(object, start, end, clean_only) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; boolean_t clean_only; { register vm_page_t p, next; unsigned int size; int s; if (object == NULL) return; object->paging_in_progress++; again: size = end - start; if (size > 4 || size >= object->size / 4) { for (p = object->memq.tqh_first; p != NULL; p = next) { next = p->listq.tqe_next; + if (p->wire_count != 0) { + vm_page_protect(p, VM_PROT_NONE); + p->valid = 0; + continue; + } if ((start <= p->pindex) && (p->pindex < end)) { s = splhigh(); - if (p->bmapped) { - splx(s); - continue; - } if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); if (clean_only) { vm_page_test_dirty(p); if (p->valid & p->dirty) continue; } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } } } else { while (size > 0) { - while ((p = vm_page_lookup(object, start)) != 0) { - s = splhigh(); - if (p->bmapped) { - splx(s); - break; + if ((p = vm_page_lookup(object, start)) != 0) { + if (p->wire_count != 0) { + p->valid = 0; + vm_page_protect(p, VM_PROT_NONE); + start += 1; + size -= 1; + continue; } + s = splhigh(); if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); if (clean_only) { vm_page_test_dirty(p); - if (p->valid & p->dirty) + if (p->valid & p->dirty) { + start += 1; + size -= 1; continue; + } } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } start += 1; size -= 1; } } vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * next_object Second object into coalesce * next_offset Offset into next_object * * prev_size Size of reference to prev_object * next_size Size of reference to next_object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size) register vm_object_t prev_object; vm_pindex_t prev_pindex; vm_size_t prev_size, next_size; { vm_size_t newsize; if (prev_object == NULL) { return (TRUE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->ref_count > 1 || prev_object->type != OBJT_DEFAULT || prev_object->backing_object != NULL) { return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; /* * Remove any pages that may still be in the object from a previous * deallocation. */ vm_object_page_remove(prev_object, prev_pindex + prev_size, prev_pindex + prev_size + next_size, FALSE); /* * Extend the object if necessary. 
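* (For instance prev_pindex 16 with prev_size 4 and next_size 2 pages * yields newsize 22; the object is only ever grown here, never shrunk.)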
*/ newsize = prev_pindex + prev_size + next_size; if (newsize > prev_object->size) prev_object->size = newsize; return (TRUE); } #ifdef DDB static int _vm_object_in_map(map, object, entry) vm_map_t map; vm_object_t object; vm_map_entry_t entry; { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if( _vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if( _vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (obj = entry->object.vm_object) { for(; obj; obj=obj->backing_object) if( obj == object) { return 1; } } return 0; } static int vm_object_in_map( object) vm_object_t object; { struct proc *p; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if( !p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; /* if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } */ if( _vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) return 1; } if( _vm_object_in_map( kernel_map, object, 0)) return 1; if( _vm_object_in_map( kmem_map, object, 0)) return 1; if( _vm_object_in_map( pager_map, object, 0)) return 1; if( _vm_object_in_map( buffer_map, object, 0)) return 1; if( _vm_object_in_map( io_map, object, 0)) return 1; if( _vm_object_in_map( phys_map, object, 0)) return 1; if( _vm_object_in_map( mb_map, object, 0)) return 1; if( _vm_object_in_map( u_map, object, 0)) return 1; return 0; } #ifdef DDB static void DDB_vm_object_check() { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. 
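* (A report from this check indicates a leaked anonymous object: one * that still holds references yet is reachable from no map entry.)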
*/ for (object = vm_object_list.tqh_first; object != NULL; object = object->object_list.tqe_next) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { printf("vmochk: internal obj has zero ref count: %d\n", object->size); } if (!vm_object_in_map(object)) { printf("vmochk: internal obj is not in a map: " "ref: %d, size: %d: 0x%x, backing_object: 0x%x\n", object->ref_count, object->size, object->size, object->backing_object); } } } } #endif /* DDB */ /* * vm_object_print: [ debug ] */ void vm_object_print(iobject, full, dummy3, dummy4) /* db_expr_t */ int iobject; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { vm_object_t object = (vm_object_t)iobject; /* XXX */ register vm_page_t p; register int count; if (object == NULL) return; iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ", (int) object, (int) object->size, object->resident_page_count, object->ref_count); printf("offset=0x%x, backing_object=(0x%x)+0x%x\n", (int) object->paging_offset, (int) object->backing_object, (int) object->backing_object_offset); printf("cache: next=%p, prev=%p\n", object->cached_list.tqe_next, object->cached_list.tqe_prev); if (!full) return; indent += 2; count = 0; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { if (count == 0) iprintf("memory:="); else if (count == 6) { printf("\n"); iprintf(" ..."); count = 0; } else printf(","); count++; printf("(off=0x%lx,page=0x%lx)", (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); } if (count != 0) printf("\n"); indent -= 2; } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 13489) +++ head/sys/vm/vm_page.c (revision 13490) @@ -1,1135 +1,1122 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.44 1995/12/17 07:19:58 bde Exp $ + * $Id: vm_page.c,v 1.45 1996/01/04 21:13:23 wollman Exp $ */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void DDB_print_page_info __P((void)); #endif /* * Associated with page of user-allocatable memory is a * page structure. */ static struct pglist *vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? */ static int vm_page_hash_mask; /* Mask for hash function */ struct pglist vm_page_queue_free; struct pglist vm_page_queue_zero; struct pglist vm_page_queue_active; struct pglist vm_page_queue_inactive; struct pglist vm_page_queue_cache; +int no_queue; + +struct { + struct pglist *pl; + int *cnt; +} vm_page_queues[PQ_CACHE+1] = { + {NULL, &no_queue}, + { &vm_page_queue_free, &cnt.v_free_count}, + { &vm_page_queue_zero, &cnt.v_free_count}, + { &vm_page_queue_inactive, &cnt.v_inactive_count}, + { &vm_page_queue_active, &cnt.v_active_count}, + { &vm_page_queue_cache, &cnt.v_cache_count} +}; + vm_page_t vm_page_array; static int vm_page_array_size; long first_page; static long last_page; static vm_size_t page_mask; static int page_shift; int vm_page_zero_count; /* * map of contiguous valid DEV_BSIZE chunks in a page * (this list is valid for page sizes upto 16*DEV_BSIZE) */ static u_short vm_page_dev_bsize_chunks[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; static inline __pure int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex)) __pure2; static void vm_page_unqueue __P((vm_page_t )); /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. * * Sets page_shift and page_mask from cnt.v_page_size. 
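The vm_page_queues[] table added above is the heart of this change: the old per-queue flag bits (PG_FREE, PG_ACTIVE, PG_INACTIVE, PG_CACHE) collapse into a single small queue index, so taking a page off whichever queue holds it becomes one table lookup, one TAILQ_REMOVE and one counter decrement. A minimal userland sketch of the same pattern; the demo_* names are illustrative only, not kernel API:

#include <stdio.h>
#include <sys/queue.h>

enum { DQ_NONE, DQ_FREE, DQ_ACTIVE, DQ_NQUEUES };

struct demo_page {
	TAILQ_ENTRY(demo_page) pageq;
	int queue;			/* DQ_* index, like vm_page.queue */
};
TAILQ_HEAD(demo_pglist, demo_page);

static struct demo_pglist freeq = TAILQ_HEAD_INITIALIZER(freeq);
static struct demo_pglist activeq = TAILQ_HEAD_INITIALIZER(activeq);
static int free_cnt, active_cnt, no_cnt;

/* index -> {list head, counter}, cf. vm_page_queues[] above */
static struct {
	struct demo_pglist *pl;
	int *cnt;
} demo_queues[DQ_NQUEUES] = {
	{ NULL, &no_cnt },
	{ &freeq, &free_cnt },
	{ &activeq, &active_cnt },
};

/* one code path unlinks a page from any queue, cf. vm_page_unqueue() */
static void
demo_unqueue(struct demo_page *m)
{
	int queue = m->queue;

	if (queue == DQ_NONE)
		return;
	m->queue = DQ_NONE;
	TAILQ_REMOVE(demo_queues[queue].pl, m, pageq);
	--(*demo_queues[queue].cnt);
}

int
main(void)
{
	struct demo_page p = { .queue = DQ_FREE };

	TAILQ_INSERT_TAIL(&freeq, &p, pageq);
	free_cnt++;
	demo_unqueue(&p);
	printf("free_cnt now %d\n", free_cnt);	/* prints 0 */
	return (0);
}

Note how two queues can share one counter simply by pointing two table slots at it; the real table above does exactly that, aiming both PQ_FREE and PQ_ZERO at &cnt.v_free_count.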
*/ void vm_set_page_size() { if (cnt.v_page_size == 0) cnt.v_page_size = DEFAULT_PAGE_SIZE; page_mask = cnt.v_page_size - 1; if ((page_mask & cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); for (page_shift = 0;; page_shift++) if ((1 << page_shift) == cnt.v_page_size) break; } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(starta, enda, vaddr) register vm_offset_t starta; vm_offset_t enda; register vm_offset_t vaddr; { register vm_offset_t mapped; register vm_page_t m; register struct pglist *bucket; vm_size_t npages, page_range; register vm_offset_t new_start; int i; vm_offset_t pa; int nblocks; vm_offset_t first_managed_page; /* the biggest memory array is the second group of pages */ vm_offset_t start; vm_offset_t biggestone, biggestsize; vm_offset_t total; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { int size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } start = phys_avail[biggestone]; /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ TAILQ_INIT(&vm_page_queue_free); TAILQ_INIT(&vm_page_queue_zero); TAILQ_INIT(&vm_page_queue_active); TAILQ_INIT(&vm_page_queue_inactive); TAILQ_INIT(&vm_page_queue_cache); /* * Allocate (and initialize) the hash table buckets. * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in * the system. * * Note: This computation can be tweaked if desired. */ vm_page_buckets = (struct pglist *) vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { - vm_page_bucket_count = 1; + vm_page_bucket_count = 2; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. */ new_start = start + vm_page_bucket_count * sizeof(struct pglist); new_start = round_page(new_start); mapped = vaddr; vaddr = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; bzero((caddr_t) mapped, vaddr - mapped); mapped = vaddr; for (i = 0; i < vm_page_bucket_count; i++) { TAILQ_INIT(bucket); bucket++; } /* * round (or truncate) the addresses to our page size. */ /* * Pre-allocate maps and map entries that cannot be dynamically * allocated via malloc(). The maps include the kernel_map and * kmem_map which must be initialized before malloc() will work * (obviously). Also could include pager maps which would be * allocated before kmeminit. * * Allow some kernel map entries... this should be plenty since people * shouldn't be cluttering up the kernel map (they should use their * own maps). */ kentry_data_size = MAX_KMAP * sizeof(struct vm_map) + MAX_KMAPENT * sizeof(struct vm_map_entry); kentry_data_size = round_page(kentry_data_size); kentry_data = (vm_offset_t) vaddr; vaddr += kentry_data_size; /* * Validate these zone addresses. 
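vm_page_startup() above rounds vm_page_bucket_count up to a power of two (now starting from 2) so that vm_page_hash_mask = count - 1 turns hash reduction into a single AND instead of a modulo. A standalone check of that equivalence, with an assumed page count:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned npages = 5000;		/* e.g. atop(total) on a small box */
	unsigned buckets = 2;
	unsigned mask, key;

	while (buckets < npages)
		buckets <<= 1;		/* 8192 for npages == 5000 */
	mask = buckets - 1;		/* 0x1fff: all low bits set */

	/* for power-of-two sizes, AND and modulo agree for any key */
	for (key = 0; key < 100000; key++)
		assert((key & mask) == (key % buckets));
	printf("%u buckets, mask 0x%x\n", buckets, mask);
	return (0);
}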
*/ new_start = start + (vaddr - mapped); pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); bzero((caddr_t) mapped, (vaddr - mapped)); start = round_page(new_start); /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = phys_avail[0] / PAGE_SIZE; last_page = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; page_range = last_page - (phys_avail[0] / PAGE_SIZE); npages = (total - (page_range * sizeof(struct vm_page)) - (start - phys_avail[biggestone])) / PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ vm_page_array = (vm_page_t) vaddr; mapped = vaddr; /* * Validate these addresses. */ new_start = round_page(start + page_range * sizeof(struct vm_page)); mapped = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; first_managed_page = start / PAGE_SIZE; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { if (i == biggestone) pa = ptoa(first_managed_page); else pa = phys_avail[i]; while (pa < phys_avail[i + 1] && npages-- > 0) { ++cnt.v_page_count; ++cnt.v_free_count; m = PHYS_TO_VM_PAGE(pa); - m->flags = PG_FREE; + m->queue = PQ_FREE; + m->flags = 0; m->phys_addr = pa; TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); pa += PAGE_SIZE; } } return (mapped); } /* * vm_page_hash: * * Distributes the object/offset key pair among hash buckets. * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. */ static inline __pure int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { return ((unsigned) object + pindex) & vm_page_hash_mask; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object/object-page * table and object list. * * The object and page must be locked, and must be splhigh. */ inline void -vm_page_insert(mem, object, pindex) - register vm_page_t mem; +vm_page_insert(m, object, pindex) + register vm_page_t m; register vm_object_t object; register vm_pindex_t pindex; { register struct pglist *bucket; - if (mem->flags & PG_TABLED) + if (m->flags & PG_TABLED) panic("vm_page_insert: already inserted"); /* * Record the object/offset pair in this page */ - mem->object = object; - mem->pindex = pindex; + m->object = object; + m->pindex = pindex; /* * Insert it into the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - TAILQ_INSERT_TAIL(bucket, mem, hashq); + TAILQ_INSERT_TAIL(bucket, m, hashq); /* * Now link into the object's list of backed pages. */ - TAILQ_INSERT_TAIL(&object->memq, mem, listq); - mem->flags |= PG_TABLED; + TAILQ_INSERT_TAIL(&object->memq, m, listq); + m->flags |= PG_TABLED; /* * And show that the object has one more resident page. */ object->resident_page_count++; } /* * vm_page_remove: [ internal use only ] * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list. * * The object and page must be locked, and at splhigh. 
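vm_page_insert() and vm_page_remove() below keep every resident page on two lists at once -- a hash bucket for object/pindex lookup and the owning object's memq for whole-object walks -- which works because struct vm_page carries one TAILQ_ENTRY per list (hashq and listq). A reduced sketch of the pattern, with illustrative types:

#include <stdio.h>
#include <sys/queue.h>

struct node {
	TAILQ_ENTRY(node) hashq;	/* links for the hash bucket */
	TAILQ_ENTRY(node) listq;	/* links for the per-object list */
	int pindex;
};
TAILQ_HEAD(nlist, node);

int
main(void)
{
	struct nlist bucket = TAILQ_HEAD_INITIALIZER(bucket);
	struct nlist memq = TAILQ_HEAD_INITIALIZER(memq);
	struct node n = { .pindex = 7 };

	/* insert on both lists, cf. vm_page_insert() */
	TAILQ_INSERT_TAIL(&bucket, &n, hashq);
	TAILQ_INSERT_TAIL(&memq, &n, listq);

	/* removal from one list leaves the other intact */
	TAILQ_REMOVE(&bucket, &n, hashq);
	printf("memq still has pindex %d\n", memq.tqh_first->pindex);
	return (0);
}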
*/ inline void -vm_page_remove(mem) - register vm_page_t mem; +vm_page_remove(m) + register vm_page_t m; { register struct pglist *bucket; - if (!(mem->flags & PG_TABLED)) + if (!(m->flags & PG_TABLED)) return; /* * Remove from the object_object/offset hash table */ - bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->pindex)]; - TAILQ_REMOVE(bucket, mem, hashq); + bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; + TAILQ_REMOVE(bucket, m, hashq); /* * Now remove from the object's list of backed pages. */ - TAILQ_REMOVE(&mem->object->memq, mem, listq); + TAILQ_REMOVE(&m->object->memq, m, listq); /* * And show that the object has one fewer resident page. */ - mem->object->resident_page_count--; + m->object->resident_page_count--; - mem->flags &= ~PG_TABLED; + m->flags &= ~PG_TABLED; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. No side effects. */ vm_page_t vm_page_lookup(object, pindex) register vm_object_t object; register vm_pindex_t pindex; { - register vm_page_t mem; + register vm_page_t m; register struct pglist *bucket; int s; /* * Search the hash table for this object/offset pair */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; s = splhigh(); - for (mem = bucket->tqh_first; mem != NULL; mem = mem->hashq.tqe_next) { - if ((mem->object == object) && (mem->pindex == pindex)) { + for (m = bucket->tqh_first; m != NULL; m = m->hashq.tqe_next) { + if ((m->object == object) && (m->pindex == pindex)) { splx(s); - return (mem); + return (m); } } splx(s); return (NULL); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. */ void -vm_page_rename(mem, new_object, new_pindex) - register vm_page_t mem; +vm_page_rename(m, new_object, new_pindex) + register vm_page_t m; register vm_object_t new_object; vm_pindex_t new_pindex; { int s; s = splhigh(); - vm_page_remove(mem); - vm_page_insert(mem, new_object, new_pindex); + vm_page_remove(m); + vm_page_insert(m, new_object, new_pindex); splx(s); } /* * vm_page_unqueue must be called at splhigh(); */ static inline void -vm_page_unqueue(vm_page_t mem) +vm_page_unqueue(vm_page_t m) { - int origflags; - - origflags = mem->flags; - - if ((origflags & (PG_ACTIVE|PG_INACTIVE|PG_CACHE)) == 0) + int queue = m->queue; + if (queue == PQ_NONE) return; - - if (origflags & PG_ACTIVE) { - TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); - cnt.v_active_count--; - mem->flags &= ~PG_ACTIVE; - } else if (origflags & PG_INACTIVE) { - TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); - cnt.v_inactive_count--; - mem->flags &= ~PG_INACTIVE; - } else if (origflags & PG_CACHE) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - cnt.v_cache_count--; - mem->flags &= ~PG_CACHE; - if (cnt.v_cache_count + cnt.v_free_count < cnt.v_free_reserved) + m->queue = PQ_NONE; + TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); + --(*vm_page_queues[queue].cnt); + if (queue == PQ_CACHE) { + if ((cnt.v_cache_count + cnt.v_free_count) < + (cnt.v_free_min + cnt.v_cache_min)) pagedaemon_wakeup(); } return; } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request - * or in: * VM_ALLOC_ZERO zero page * * Object must be locked. 
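On the allocation classes listed above: VM_ALLOC_ZERO is no longer a bit OR'd into page_req (formerly 0x80) but a class in its own right, so the function below can switch() on page_req directly. What varies per class is only how low free memory may fall before the request must draw from the cache queue or fail outright. A condensed model of that policy (parameter names invented for the demo; the ZERO class's preference for the pre-zeroed queue is omitted here):

#include <stdio.h>

enum alloc_class { ALLOC_NORMAL, ALLOC_INTERRUPT, ALLOC_SYSTEM, ALLOC_ZERO };

/*
 * Which queue may the request draw from, given the free page count
 * (PQ_FREE and PQ_ZERO share one counter) and the cache count?
 * Returns 'f' for the free/zero pool, 'c' for cache, or 0 for
 * "fail and wake the pagedaemon".
 */
static char
pick_queue(enum alloc_class class, int free, int cache,
    int free_reserved, int interrupt_free_min)
{
	switch (class) {
	case ALLOC_INTERRUPT:
		return (free > 0 ? 'f' : 0);	/* never dips into cache */
	case ALLOC_SYSTEM:
		if (free >= free_reserved ||
		    (cache == 0 && free >= interrupt_free_min))
			return ('f');
		break;
	default:				/* NORMAL and ZERO */
		if (free >= free_reserved)
			return ('f');
		break;
	}
	return (cache > 0 ? 'c' : 0);
}

int
main(void)
{
	/* low memory: a normal request must fall back to the cache queue */
	printf("%c\n", pick_queue(ALLOC_NORMAL, 3, 10, 8, 2));	/* 'c' */
	/* a system request with an empty cache may dig into the reserve */
	printf("%c\n", pick_queue(ALLOC_SYSTEM, 3, 0, 8, 2));	/* 'f' */
	return (0);
}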
*/ vm_page_t vm_page_alloc(object, pindex, page_req) vm_object_t object; vm_pindex_t pindex; int page_req; { - register vm_page_t mem; + register vm_page_t m; + int queue; int s; #ifdef DIAGNOSTIC - mem = vm_page_lookup(object, pindex); - if (mem) + m = vm_page_lookup(object, pindex); + if (m) panic("vm_page_alloc: page already allocated"); #endif if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splhigh(); - switch ((page_req & ~(VM_ALLOC_ZERO))) { + switch (page_req) { + case VM_ALLOC_NORMAL: if (cnt.v_free_count >= cnt.v_free_reserved) { - if (page_req & VM_ALLOC_ZERO) { - mem = vm_page_queue_zero.tqh_first; - if (mem) { - --vm_page_zero_count; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } else { - mem = vm_page_queue_free.tqh_first; - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } - } else { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { - --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } + m = vm_page_queue_free.tqh_first; + if (m == NULL) { + --vm_page_zero_count; + m = vm_page_queue_zero.tqh_first; } - cnt.v_free_count--; } else { - mem = vm_page_queue_cache.tqh_first; - if (mem != NULL) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - vm_page_remove(mem); - mem->flags = PG_BUSY; - cnt.v_cache_count--; + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { + splx(s); + pagedaemon_wakeup(); + return (NULL); + } + } + break; + + case VM_ALLOC_ZERO: + if (cnt.v_free_count >= cnt.v_free_reserved) { + m = vm_page_queue_zero.tqh_first; + if (m) { + --vm_page_zero_count; } else { + m = vm_page_queue_free.tqh_first; + } + } else { + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { splx(s); pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_SYSTEM: if ((cnt.v_free_count >= cnt.v_free_reserved) || ((cnt.v_cache_count == 0) && (cnt.v_free_count >= cnt.v_interrupt_free_min))) { - if (page_req & VM_ALLOC_ZERO) { - mem = vm_page_queue_zero.tqh_first; - if (mem) { + m = vm_page_queue_free.tqh_first; + if (m == NULL) { --vm_page_zero_count; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } else { - mem = vm_page_queue_free.tqh_first; - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; + m = vm_page_queue_zero.tqh_first; } - } else { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { - --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } - } - cnt.v_free_count--; } else { - mem = vm_page_queue_cache.tqh_first; - if (mem != NULL) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - vm_page_remove(mem); - mem->flags = PG_BUSY; - cnt.v_cache_count--; - } else { + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { splx(s); pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_INTERRUPT: if (cnt.v_free_count > 0) { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { + m = vm_page_queue_free.tqh_first; + if (m == NULL) { --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; + m = 
vm_page_queue_zero.tqh_first; } - cnt.v_free_count--; } else { splx(s); pagedaemon_wakeup(); - return NULL; + return (NULL); } break; default: panic("vm_page_alloc: invalid allocation class"); } - mem->wire_count = 0; - mem->hold_count = 0; - mem->act_count = 0; - mem->busy = 0; - mem->valid = 0; - mem->dirty = 0; - mem->bmapped = 0; + queue = m->queue; + TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); + --(*vm_page_queues[queue].cnt); + if (queue == PQ_ZERO) { + m->flags = PG_ZERO|PG_BUSY; + } else if (queue == PQ_CACHE) { + vm_page_remove(m); + m->flags = PG_BUSY; + } else { + m->flags = PG_BUSY; + } + m->wire_count = 0; + m->hold_count = 0; + m->act_count = 0; + m->busy = 0; + m->valid = 0; + m->dirty = 0; + m->queue = PQ_NONE; /* XXX before splx until vm_page_insert is safe */ - vm_page_insert(mem, object, pindex); + vm_page_insert(m, object, pindex); splx(s); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ - if (((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) || - (cnt.v_free_count < cnt.v_pageout_free_min)) + if (((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min)) || + (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); - return (mem); + return (m); } vm_offset_t vm_page_alloc_contig(size, low, high, alignment) vm_offset_t size; vm_offset_t low; vm_offset_t high; vm_offset_t alignment; { int i, s, start; vm_offset_t addr, phys, tmp_addr; vm_page_t pga = vm_page_array; if ((alignment & (alignment - 1)) != 0) panic("vm_page_alloc_contig: alignment must be a power of 2"); start = 0; s = splhigh(); again: /* * Find first page in array that is free, within range, and aligned. */ for (i = start; i < cnt.v_page_count; i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); - if (((pga[i].flags & PG_FREE) == PG_FREE) && + if ((pga[i].queue == PQ_FREE) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ - if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { + if ((i == cnt.v_page_count) || + ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { splx(s); return (NULL); } start = i; /* * Check successive pages for contiguous and free. */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { if ((VM_PAGE_TO_PHYS(&pga[i]) != - (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || - ((pga[i].flags & PG_FREE) != PG_FREE)) { + (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || + (pga[i].queue != PQ_FREE)) { start++; goto again; } } /* * We've found a contiguous chunk that meets our requirements. * Allocate kernel VM, unfree and assign the physical pages to it and * return kernel VM pointer. */ tmp_addr = addr = kmem_alloc_pageable(kernel_map, size); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; TAILQ_REMOVE(&vm_page_queue_free, m, pageq); cnt.v_free_count--; m->valid = VM_PAGE_BITS_ALL; m->flags = 0; m->dirty = 0; m->wire_count = 0; m->act_count = 0; - m->bmapped = 0; m->busy = 0; + m->queue = PQ_NONE; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); vm_page_wire(m); pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m)); tmp_addr += PAGE_SIZE; } splx(s); return (addr); } /* * vm_page_free: * * Returns the given page to the free list, * disassociating it from any VM object. * * Object and page must be locked prior to entry.
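vm_page_alloc_contig() above searches vm_page_array by brute force: find an aligned free page with a physical address in [low, high), then verify each successor is free and physically adjacent, restarting the search past the failure point otherwise. The same scan in miniature over a boolean free map (sizes and names invented for the demo; the kernel must also compare VM_PAGE_TO_PHYS() values, since the array may span holes in physical memory):

#include <stdio.h>

#define NPAGES 16

/*
 * 1 = free; find `run` consecutive free pages whose start is aligned
 * to `align` pages (align must be a power of two, cf. the panic above).
 */
static int
find_contig(const int *free_map, int run, int align)
{
	int i, j;

	for (i = 0; i + run <= NPAGES; i++) {
		if (!free_map[i] || (i & (align - 1)) != 0)
			continue;
		for (j = 1; j < run; j++)
			if (!free_map[i + j])
				break;
		if (j == run)
			return (i);	/* start of the run */
	}
	return (-1);
}

int
main(void)
{
	int free_map[NPAGES] = {
		1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
	};

	/* four contiguous pages, start aligned to a 4-page boundary */
	printf("run starts at page %d\n", find_contig(free_map, 4, 4));
	return (0);
}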
*/ void -vm_page_free(mem) - register vm_page_t mem; +vm_page_free(m) + register vm_page_t m; { int s; - int flags; + int flags = m->flags; s = splhigh(); - vm_page_remove(mem); - vm_page_unqueue(mem); - - flags = mem->flags; - if (mem->bmapped || mem->busy || flags & (PG_BUSY|PG_FREE)) { - if (flags & PG_FREE) + if (m->busy || (flags & PG_BUSY) || (m->queue == PQ_FREE)) { + printf("vm_page_free: pindex(%ld), busy(%d), PG_BUSY(%d)\n", + m->pindex, m->busy, (flags & PG_BUSY) ? 1 : 0); + if (m->queue == PQ_FREE) panic("vm_page_free: freeing free page"); - printf("vm_page_free: pindex(%ld), bmapped(%d), busy(%d), PG_BUSY(%d)\n", - mem->pindex, mem->bmapped, mem->busy, (flags & PG_BUSY) ? 1 : 0); - panic("vm_page_free: freeing busy page"); + else + panic("vm_page_free: freeing busy page"); } + vm_page_remove(m); + vm_page_unqueue(m); + +/* if ((flags & PG_WANTED) != 0) - wakeup(mem); + wakeup(m); +*/ if ((flags & PG_FICTITIOUS) == 0) { - if (mem->wire_count) { - if (mem->wire_count > 1) { - printf("vm_page_free: wire count > 1 (%d)", mem->wire_count); + if (m->wire_count) { + if (m->wire_count > 1) { + printf("vm_page_free: wire count > 1 (%d)", m->wire_count); panic("vm_page_free: invalid wire count"); } cnt.v_wire_count--; - mem->wire_count = 0; + m->wire_count = 0; } - mem->flags |= PG_FREE; - TAILQ_INSERT_TAIL(&vm_page_queue_free, mem, pageq); + m->queue = PQ_FREE; + TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); splx(s); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } cnt.v_free_count++; /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if ((cnt.v_free_count + cnt.v_cache_count) == cnt.v_free_min) { wakeup(&cnt.v_free_count); wakeup(&proc0); } } else { splx(s); } cnt.v_tfree++; } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. */ void -vm_page_wire(mem) - register vm_page_t mem; +vm_page_wire(m) + register vm_page_t m; { int s; - if (mem->wire_count == 0) { + if (m->wire_count == 0) { s = splhigh(); - vm_page_unqueue(mem); + vm_page_unqueue(m); splx(s); cnt.v_wire_count++; } - mem->flags |= PG_WRITEABLE|PG_MAPPED; - mem->wire_count++; + m->wire_count++; + m->flags |= PG_MAPPED; } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * The page queues must be locked. */ void -vm_page_unwire(mem) - register vm_page_t mem; +vm_page_unwire(m) + register vm_page_t m; { int s; s = splhigh(); - if (mem->wire_count) - mem->wire_count--; - if (mem->wire_count == 0) { - TAILQ_INSERT_TAIL(&vm_page_queue_active, mem, pageq); - cnt.v_active_count++; - mem->flags |= PG_ACTIVE; + if (m->wire_count > 0) + m->wire_count--; + + if (m->wire_count == 0) { cnt.v_wire_count--; + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m->queue = PQ_ACTIVE; + if( m->act_count < ACT_MAX) + m->act_count += 1; + cnt.v_active_count++; } splx(s); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * * The page queues must be locked. 
*/ void vm_page_activate(m) register vm_page_t m; { int s; s = splhigh(); - if (m->flags & PG_ACTIVE) + if (m->queue == PQ_ACTIVE) panic("vm_page_activate: already active"); - if (m->flags & PG_CACHE) + if (m->queue == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); if (m->wire_count == 0) { TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); - m->flags |= PG_ACTIVE; + m->queue = PQ_ACTIVE; if (m->act_count < 5) m->act_count = 5; else if( m->act_count < ACT_MAX) m->act_count += 1; cnt.v_active_count++; } splx(s); } /* * vm_page_deactivate: * * Returns the given page to the inactive list, * indicating that no physical maps have access * to this page. [Used by the physical mapping system.] * * The page queues must be locked. */ void vm_page_deactivate(m) register vm_page_t m; { int spl; /* * Only move active pages -- ignore locked or already inactive ones. * * XXX: sometimes we get pages which aren't wired down or on any queue - * we need to put them on the inactive queue also, otherwise we lose * track of them. Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. */ + if (m->queue == PQ_INACTIVE) + return; spl = splhigh(); - if (!(m->flags & PG_INACTIVE) && m->wire_count == 0 && - m->hold_count == 0) { - if (m->flags & PG_CACHE) + if (m->wire_count == 0 && m->hold_count == 0) { + if (m->queue == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); - m->flags |= PG_INACTIVE; + m->queue = PQ_INACTIVE; cnt.v_inactive_count++; m->act_count = 0; } splx(spl); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). */ void vm_page_cache(m) register vm_page_t m; { int s; - if ((m->flags & (PG_CACHE | PG_BUSY)) || m->busy || m->wire_count || - m->bmapped) + if ((m->flags & PG_BUSY) || m->busy || m->wire_count) return; + if (m->queue == PQ_CACHE) + return; + vm_page_protect(m, VM_PROT_NONE); s = splhigh(); vm_page_unqueue(m); - vm_page_protect(m, VM_PROT_NONE); - TAILQ_INSERT_TAIL(&vm_page_queue_cache, m, pageq); - m->flags |= PG_CACHE; + m->queue = PQ_CACHE; cnt.v_cache_count++; if ((cnt.v_free_count + cnt.v_cache_count) == cnt.v_free_min) { wakeup(&cnt.v_free_count); wakeup(&proc0); } if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } - splx(s); } /* * vm_page_zero_fill: * * Zero-fill the specified page. * Written as a standard pagein routine, to * be used by the zero-fill object. 
*/ boolean_t vm_page_zero_fill(m) vm_page_t m; { pmap_zero_page(VM_PAGE_TO_PHYS(m)); return (TRUE); } /* * vm_page_copy: * * Copy one page to another */ void vm_page_copy(src_m, dest_m) vm_page_t src_m; vm_page_t dest_m; { pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); dest_m->valid = VM_PAGE_BITS_ALL; } /* * mapping function for valid bits or for dirty bits in * a page */ inline int vm_page_bits(int base, int size) { u_short chunk; if ((base == 0) && (size >= PAGE_SIZE)) return VM_PAGE_BITS_ALL; size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); base = (base % PAGE_SIZE) / DEV_BSIZE; chunk = vm_page_dev_bsize_chunks[size / DEV_BSIZE]; return (chunk << base) & VM_PAGE_BITS_ALL; } /* * set a page valid and clean */ void vm_page_set_validclean(m, base, size) vm_page_t m; int base; int size; { int pagebits = vm_page_bits(base, size); m->valid |= pagebits; m->dirty &= ~pagebits; if( base == 0 && size == PAGE_SIZE) pmap_clear_modify(VM_PAGE_TO_PHYS(m)); } /* * set a page (partially) invalid */ void vm_page_set_invalid(m, base, size) vm_page_t m; int base; int size; { int bits; m->valid &= ~(bits = vm_page_bits(base, size)); if (m->valid == 0) m->dirty &= ~bits; } /* * is (partial) page valid? */ int vm_page_is_valid(m, base, size) vm_page_t m; int base; int size; { int bits = vm_page_bits(base, size); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } void vm_page_test_dirty(m) vm_page_t m; { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(VM_PAGE_TO_PHYS(m))) { m->dirty = VM_PAGE_BITS_ALL; } } #ifdef DDB void DDB_print_page_info(void) { printf("cnt.v_free_count: %d\n", cnt.v_free_count); printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); printf("cnt.v_active_count: %d\n", cnt.v_active_count); printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); printf("cnt.v_free_min: %d\n", cnt.v_free_min); printf("cnt.v_free_target: %d\n", cnt.v_free_target); printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } #endif Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 13489) +++ head/sys/vm/vm_page.h (revision 13490) @@ -1,298 +1,303 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_page.h,v 1.22 1995/11/20 12:19:32 phk Exp $ + * $Id: vm_page.h,v 1.23 1995/12/11 04:58:26 dyson Exp $ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). 
*/ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ - + u_short queue:4, /* page queue index */ + flags:12; /* see below */ u_short wire_count; /* wired down maps refs (P) */ - u_short flags; /* see below */ short hold_count; /* page hold count */ - u_short act_count; /* page usage count */ - u_short bmapped; /* number of buffers mapped */ - u_short busy; /* page busy count */ - u_short valid; /* map of valid DEV_BSIZE chunks */ - u_short dirty; /* map of dirty DEV_BSIZE chunks */ + u_char act_count; /* page usage count */ + u_char busy; /* page busy count */ + /* NOTE that these must support one bit per DEV_BSIZE in a page!!! */ + /* so, on normal X86 kernels, they must be at least 8 bits wide */ + u_char valid; /* map of valid DEV_BSIZE chunks */ + u_char dirty; /* map of dirty DEV_BSIZE chunks */ }; +#define PQ_NONE 0 +#define PQ_FREE 1 +#define PQ_ZERO 2 +#define PQ_INACTIVE 3 +#define PQ_ACTIVE 4 +#define PQ_CACHE 5 + /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ -#define PG_INACTIVE 0x0001 /* page is in inactive list (P) */ -#define PG_ACTIVE 0x0002 /* page is in active list (P) */ -#define PG_BUSY 0x0010 /* page is in transit (O) */ -#define PG_WANTED 0x0020 /* someone is waiting for page (O) */ -#define PG_TABLED 0x0040 /* page is in VP table (O) */ -#define PG_FICTITIOUS 0x0100 /* physical page doesn't exist (O) */ -#define PG_WRITEABLE 0x0200 /* page is mapped writeable */ -#define PG_MAPPED 0x0400 /* page is mapped */ -#define PG_ZERO 0x0800 /* page is zeroed */ -#define PG_REFERENCED 0x1000 /* page has been referenced */ -#define PG_CACHE 0x4000 /* On VMIO cache */ -#define PG_FREE 0x8000 /* page is in free list */ +#define PG_BUSY 0x01 /* page is in transit (O) */ +#define PG_WANTED 0x02 /* someone is waiting for page (O) */ +#define PG_TABLED 0x04 /* page is in VP table (O) */ +#define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ +#define PG_WRITEABLE 0x10 /* page is mapped writeable */ +#define PG_MAPPED 0x20 /* page is mapped */ +#define PG_ZERO 0x40 /* page is zeroed */ +#define PG_REFERENCED 0x80 /* page has been referenced */ +#define PG_CLEANCHK 0x100 /* page has been checked for cleaning */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_MAX 100 #define PFCLUSTER_BEHIND 3 #define PFCLUSTER_AHEAD 3 #ifdef KERNEL /* * Each pageable resident page falls into one of five lists: * * free * Available for allocation now. * * The following are all LRU sorted: * * cache * Almost available for allocation. Still in an * object, but clean and immediately freeable at * non-interrupt times. * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced.
* * zero * Pages that are really free and have been pre-zeroed * */ extern struct pglist vm_page_queue_free; /* memory free queue */ extern struct pglist vm_page_queue_zero; /* zeroed memory free queue */ extern struct pglist vm_page_queue_active; /* active memory queue */ extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ extern struct pglist vm_page_queue_cache; /* cache memory queue */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long first_page; /* first physical page number */ /* ... represented in vm_page_array */ extern long last_page; /* last physical page number */ /* ... represented in vm_page_array */ /* [INCLUSIVE] */ extern vm_offset_t first_phys_addr; /* physical address for first_page */ extern vm_offset_t last_phys_addr; /* physical address for last_page */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define IS_VM_PHYSADDR(pa) \ ((pa) >= first_phys_addr && (pa) <= last_phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) /* * Functions implemented as macros */ #define PAGE_ASSERT_WAIT(m, interruptible) { \ (m)->flags |= PG_WANTED; \ assert_wait((int) (m), (interruptible)); \ } #define PAGE_WAKEUP(m) { \ (m)->flags &= ~PG_BUSY; \ if ((m)->flags & PG_WANTED) { \ (m)->flags &= ~PG_WANTED; \ wakeup((caddr_t) (m)); \ } \ } #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xff #endif #if PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffff #endif #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 -#define VM_ALLOC_ZERO 0x80 +#define VM_ALLOC_ZERO 3 void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); void vm_page_copy __P((vm_page_t, vm_page_t)); void vm_page_deactivate __P((vm_page_t)); void vm_page_free __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t)); void vm_page_wire __P((vm_page_t)); boolean_t vm_page_zero_fill __P((vm_page_t)); void vm_page_set_validclean __P((vm_page_t, int, int)); void vm_page_set_invalid __P((vm_page_t, int, int)); int vm_page_is_valid __P((vm_page_t, int, int)); void vm_page_test_dirty __P((vm_page_t)); int vm_page_bits __P((int, int)); /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). 
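On the widths chosen in the struct above: valid and dirty shrink to u_char because, as the new comment says, they need one bit per DEV_BSIZE chunk, and a 4096-byte page holds 4096/512 = 8 chunks -- hence VM_PAGE_BITS_ALL == 0xff for PAGE_SIZE 4096. vm_page_bits() in vm_page.c builds the chunk masks from the vm_page_dev_bsize_chunks[] lookup table; a standalone rendering of that arithmetic with the i386 constants:

#include <stdio.h>

#define DEV_BSIZE 512
#define PAGE_SIZE 4096
#define VM_PAGE_BITS_ALL 0xff		/* 8 chunks per 4K page */

static unsigned short chunks[] = {	/* n low bits set, n = 0..16 */
	0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff,
	0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff
};

/* same computation as vm_page_bits() */
static int
page_bits(int base, int size)
{
	if (base == 0 && size >= PAGE_SIZE)
		return (VM_PAGE_BITS_ALL);
	size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);	/* round up */
	base = (base % PAGE_SIZE) / DEV_BSIZE;			/* chunk # */
	return ((chunks[size / DEV_BSIZE] << base) & VM_PAGE_BITS_ALL);
}

int
main(void)
{
	printf("0x%x\n", page_bits(1024, 2048));	/* 0x3c: chunks 2-5 */
	printf("0x%x\n", page_bits(0, 4096));		/* 0xff: whole page */
	return (0);
}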
*/ static __inline void vm_page_hold(vm_page_t mem) { mem->hold_count++; } #ifdef DIAGNOSTIC #include /* make GCC shut up */ #endif static __inline void vm_page_unhold(vm_page_t mem) { #ifdef DIAGNOSTIC if (--mem->hold_count < 0) panic("vm_page_unhold: hold count < 0!!!"); #else --mem->hold_count; #endif } static __inline void vm_page_protect(vm_page_t mem, int prot) { if (prot == VM_PROT_NONE) { if (mem->flags & (PG_WRITEABLE|PG_MAPPED)) { pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~(PG_WRITEABLE|PG_MAPPED); } } else if ((prot == VM_PROT_READ) && (mem->flags & PG_WRITEABLE)) { pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~PG_WRITEABLE; } } #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 13489) +++ head/sys/vm/vm_pageout.c (revision 13490) @@ -1,989 +1,984 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. 
* * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.62 1995/12/11 04:58:28 dyson Exp $ + * $Id: vm_pageout.c,v 1.63 1995/12/14 09:55:09 phk Exp $ */ /* * The proverbial page-out daemon. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t, int)); static int vm_pageout_scan __P((void)); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT_KT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) /* the kernel process "vm_daemon"*/ static void vm_daemon __P((void)); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT_KT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ extern int npendingio; static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; extern int nswiodone; extern int vm_swap_size; extern int vfs_update_wakeup; #define MAXSCAN 1024 /* maximum number of pages to scan in queues */ #define MAXLAUNDER (cnt.v_page_count > 1800 ? 32 : 16) #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ typedef int freeer_fcn_t __P((vm_map_t, vm_object_t, int, int)); static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_map_entry_t, int *, freeer_fcn_t *)); static freeer_fcn_t vm_pageout_object_deactivate_pages; static void vm_req_vmdaemon __P((void)); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. * * And we set pageout-in-progress to keep the object from disappearing * during pageout. This guarantees that the page won't move from the * inactive queue. (However, any other page on the inactive queue may * move!) */ static int vm_pageout_clean(m, sync) vm_page_t m; int sync; { register vm_object_t object; vm_page_t mc[2*VM_PAGEOUT_PAGE_COUNT]; int pageout_count; int i, forward_okay, backward_okay, page_base; vm_pindex_t pindex = m->pindex; object = m->object; /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((sync != VM_PAGEOUT_FORCE) && (object->type != OBJT_SWAP) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; /* * Don't mess with the page if it's busy. */ if ((!sync && m->hold_count != 0) || ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; /* * Try collapsing before it's too late. 
*/ if (!sync && object->backing_object) { vm_object_collapse(object); } mc[VM_PAGEOUT_PAGE_COUNT] = m; pageout_count = 1; page_base = VM_PAGEOUT_PAGE_COUNT; forward_okay = TRUE; if (pindex != 0) backward_okay = TRUE; else backward_okay = FALSE; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. */ for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) { vm_page_t p; /* * See if forward page is clusterable. */ if (forward_okay) { /* * Stop forward scan at end of object. */ if ((pindex + i) > object->size) { forward_okay = FALSE; goto do_backward; } p = vm_page_lookup(object, pindex + i); if (p) { - if ((p->flags & (PG_BUSY|PG_CACHE)) || p->busy) { + if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { forward_okay = FALSE; goto do_backward; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && - ((p->flags & PG_INACTIVE) || + ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[VM_PAGEOUT_PAGE_COUNT + i] = p; pageout_count++; if (pageout_count == vm_pageout_page_count) break; } else { forward_okay = FALSE; } } else { forward_okay = FALSE; } } do_backward: /* * See if backward page is clusterable. */ if (backward_okay) { /* * Stop backward scan at beginning of object. */ if ((pindex - i) == 0) { backward_okay = FALSE; } p = vm_page_lookup(object, pindex - i); if (p) { - if ((p->flags & (PG_BUSY|PG_CACHE)) || p->busy) { + if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { backward_okay = FALSE; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && - ((p->flags & PG_INACTIVE) || + ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[VM_PAGEOUT_PAGE_COUNT - i] = p; pageout_count++; page_base--; if (pageout_count == vm_pageout_page_count) break; } else { backward_okay = FALSE; } } else { backward_okay = FALSE; } } } /* * we allow reads during pageouts... */ for (i = page_base; i < (page_base + pageout_count); i++) { mc[i]->flags |= PG_BUSY; vm_page_protect(mc[i], VM_PROT_READ); } return vm_pageout_flush(&mc[page_base], pageout_count, sync); } int vm_pageout_flush(mc, count, sync) vm_page_t *mc; int count; int sync; { register vm_object_t object; int pageout_status[count]; int anyok = 0; int i; object = mc[0]->object; object->paging_in_progress += count; vm_pager_put_pages(object, mc, count, ((sync || (object == kernel_object)) ? TRUE : FALSE), pageout_status); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: ++anyok; break; case VM_PAGER_PEND: ++anyok; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(VM_PAGE_TO_PHYS(mt)); mt->dirty = 0; break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ - if (mt->flags & PG_INACTIVE) + if (mt->queue == PQ_INACTIVE) vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. 
*/ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); - if ((mt->flags & (PG_REFERENCED|PG_WANTED)) || - pmap_is_referenced(VM_PAGE_TO_PHYS(mt))) { - pmap_clear_reference(VM_PAGE_TO_PHYS(mt)); - mt->flags &= ~PG_REFERENCED; - if (mt->flags & PG_INACTIVE) - vm_page_activate(mt); - } PAGE_WAKEUP(mt); } } return anyok; } /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. */ static int vm_pageout_object_deactivate_pages(map, object, count, map_remove_only) vm_map_t map; vm_object_t object; int count; int map_remove_only; { register vm_page_t p, next; int rcount; int dcount; dcount = 0; if (count == 0) count = 1; if (object->type == OBJT_DEVICE) return 0; if (object->backing_object) { if (object->backing_object->ref_count == 1) dcount += vm_pageout_object_deactivate_pages(map, object->backing_object, count / 2 + 1, map_remove_only); else vm_pageout_object_deactivate_pages(map, object->backing_object, count, 1); } if (object->paging_in_progress) return dcount; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = object->memq.tqh_first; while (p && (rcount-- > 0)) { next = p->listq.tqe_next; cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || + (p->flags & PG_BUSY) || !pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { p = next; continue; } /* * if a page is active, not wired and is in the processes * pmap, then deactivate the page. */ - if ((p->flags & (PG_ACTIVE | PG_BUSY)) == PG_ACTIVE) { + if (p->queue == PQ_ACTIVE) { if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p)) && - (p->flags & (PG_REFERENCED|PG_WANTED)) == 0) { + (p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we * deactivate */ if (!p->act_count) { if (!map_remove_only) vm_page_deactivate(p); vm_page_protect(p, VM_PROT_NONE); /* * else if on the next go-around we * will deactivate the page we need to * place the page on the end of the * queue to age the other pages in * memory. */ } else { TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); } /* * see if we are done yet */ - if (p->flags & PG_INACTIVE) { + if (p->queue == PQ_INACTIVE) { --count; ++dcount; if (count <= 0 && cnt.v_inactive_count > cnt.v_inactive_target) { return dcount; } } } else { /* * Move the page to the bottom of the queue. */ pmap_clear_reference(VM_PAGE_TO_PHYS(p)); p->flags &= ~PG_REFERENCED; if (p->act_count < ACT_MAX) p->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); } - } else if ((p->flags & (PG_INACTIVE | PG_BUSY)) == PG_INACTIVE) { + } else if (p->queue == PQ_INACTIVE) { vm_page_protect(p, VM_PROT_NONE); } p = next; } return dcount; } - /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. 
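The act_count handling above is the page-aging policy used throughout the scanner: a referenced page gains ACT_ADVANCE (capped at ACT_MAX) and is requeued at the tail, while an unreferenced one loses ACT_DECLINE and is deactivated only once the count reaches zero. Gaining 3 per reference but losing 1 per idle scan keeps recently useful pages resident across short lulls. One scan step, modeled in isolation:

#include <stdio.h>

#define ACT_DECLINE 1
#define ACT_ADVANCE 3
#define ACT_MAX 100

/* one scanner visit: returns 1 when the page should be deactivated */
static int
age_page(int *act_count, int referenced)
{
	if (referenced) {
		if (*act_count < ACT_MAX)
			*act_count += ACT_ADVANCE;
		return (0);
	}
	if (*act_count > ACT_DECLINE)
		*act_count -= ACT_DECLINE;	/* cf. min(act_count, ...) */
	else
		*act_count = 0;
	return (*act_count == 0);
}

int
main(void)
{
	int act = 5, visits = 0;

	/* an untouched page with act_count 5 survives four more visits */
	while (!age_page(&act, 0))
		visits++;
	printf("deactivated on visit %d\n", visits + 1);	/* 5 */
	return (0);
}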
*/ static void vm_pageout_map_deactivate_pages(map, entry, count, freeer) vm_map_t map; vm_map_entry_t entry; int *count; freeer_fcn_t *freeer; { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (*count <= 0) return; vm_map_reference(map); if (!lock_try_read(&map->lock)) { vm_map_deallocate(map); return; } if (entry == 0) { tmpe = map->header.next; while (tmpe != &map->header && *count > 0) { vm_pageout_map_deactivate_pages(map, tmpe, count, freeer); tmpe = tmpe->next; }; } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; while (tmpe != &tmpm->header && *count > 0) { vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer); tmpe = tmpe->next; }; } else if ((obj = entry->object.vm_object) != 0) { *count -= (*freeer) (map, obj, *count, TRUE); } lock_read_done(&map->lock); vm_map_deallocate(map); return; } static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz / 10)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static int vm_pageout_scan() { vm_page_t m; int page_shortage, maxscan, maxlaunder, pcount; int pages_freed; vm_page_t next; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int force_wakeup = 0; int vnodes_skipped = 0; pages_freed = 0; /* * Start scanning the inactive queue for pages we can free. We keep * scanning until we have enough free pages or we have scanned through * the entire queue. If we encounter dirty pages, we start cleaning * them. */ maxlaunder = (cnt.v_inactive_target > MAXLAUNDER) ? MAXLAUNDER : cnt.v_inactive_target; rescan1: maxscan = cnt.v_inactive_count; m = vm_page_queue_inactive.tqh_first; while ((m != NULL) && (maxscan-- > 0) && ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_cache_min + cnt.v_free_target))) { vm_page_t next; cnt.v_pdpages++; next = m->pageq.tqe_next; #if defined(VM_DIAGNOSE) - if ((m->flags & PG_INACTIVE) == 0) { + if (m->queue != PQ_INACTIVE) { printf("vm_pageout_scan: page not inactive?\n"); break; } #endif /* * dont mess with busy pages */ - if (m->hold_count || m->busy || (m->flags & PG_BUSY)) { + if (m->busy || (m->flags & PG_BUSY)) { + m = next; + continue; + } + if (m->hold_count) { TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); m = next; continue; } + if (((m->flags & PG_REFERENCED) == 0) && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { m->flags |= PG_REFERENCED; } if (m->object->ref_count == 0) { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); } - if ((m->flags & (PG_REFERENCED|PG_WANTED)) != 0) { + if ((m->flags & PG_REFERENCED) != 0) { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); if (m->act_count < ACT_MAX) m->act_count += ACT_ADVANCE; m = next; continue; } - vm_page_test_dirty(m); if (m->dirty == 0) { - if (m->bmapped == 0) { - if (m->valid == 0) { - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); - vm_page_free(m); - cnt.v_dfree++; - } else { - vm_page_cache(m); - } - ++pages_freed; - } else { - m = next; - continue; - } + vm_page_test_dirty(m); + } else if (m->dirty != 0) + m->dirty = VM_PAGE_BITS_ALL; + if (m->valid == 0) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + cnt.v_dfree++; + ++pages_freed; + } else if (m->dirty == 0) { + vm_page_cache(m); + ++pages_freed; } else if (maxlaunder > 0) { int written; struct vnode *vp = NULL; object = m->object; if (object->flags & OBJ_DEAD) { 
m = next; continue; } if (object->type == OBJT_VNODE) { vp = object->handle; if (VOP_ISLOCKED(vp) || vget(vp, 1)) { if (object->flags & OBJ_MIGHTBEDIRTY) ++vnodes_skipped; m = next; continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ written = vm_pageout_clean(m, 0); if (vp) vput(vp); if (!next) { break; } maxlaunder -= written; /* * if the next page has been re-activated, start * scanning again */ - if ((next->flags & PG_INACTIVE) == 0) { + if (next->queue != PQ_INACTIVE) { vm_pager_sync(); goto rescan1; } } m = next; } /* * Compute the page shortage. If we are still very low on memory be * sure that we will move a minimal amount of pages from active to * inactive. */ page_shortage = cnt.v_inactive_target - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) { if (pages_freed == 0) { page_shortage = cnt.v_free_min - cnt.v_free_count; } else { page_shortage = 1; } } maxscan = MAXSCAN; pcount = cnt.v_active_count; m = vm_page_queue_active.tqh_first; - while ((m != NULL) && (maxscan > 0) && (pcount-- > 0) && (page_shortage > 0)) { + while ((m != NULL) && (maxscan > 0) && + (pcount-- > 0) && (page_shortage > 0)) { cnt.v_pdpages++; next = m->pageq.tqe_next; /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); m = next; - /* printf("busy: s: %d, f: 0x%x, h: %d\n", - m->busy, m->flags, m->hold_count); */ continue; } if (m->object->ref_count && - ((m->flags & (PG_REFERENCED|PG_WANTED)) || - pmap_is_referenced(VM_PAGE_TO_PHYS(m)))) { + ((m->flags & PG_REFERENCED) || + pmap_is_referenced(VM_PAGE_TO_PHYS(m))) ) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); m->flags &= ~PG_REFERENCED; if (m->act_count < ACT_MAX) { m->act_count += ACT_ADVANCE; } TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); } else { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); m->act_count -= min(m->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!m->act_count && (page_shortage > 0)) { if (m->object->ref_count == 0) { --page_shortage; vm_page_test_dirty(m); - if ((m->bmapped == 0) && (m->dirty == 0) ) { + if (m->dirty == 0) { m->act_count = 0; vm_page_cache(m); } else { vm_page_deactivate(m); } } else { vm_page_deactivate(m); --page_shortage; } } else if (m->act_count) { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); } } maxscan--; m = next; } /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. */ while (cnt.v_free_count < cnt.v_free_reserved) { m = vm_page_queue_cache.tqh_first; if (!m) break; vm_page_free(m); cnt.v_dfree++; } /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. 
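The page_shortage computation in vm_pageout_scan() above decides how many active pages to push toward the inactive queue: normally the inactive target minus everything already free, inactive or cached, but when that comes out non-positive while the pass freed nothing, it falls back to the v_free_min deficit so the scan still makes progress. The same logic extracted, with sample numbers invented for the demo:

#include <stdio.h>

/* how many active pages should be moved toward the inactive queue? */
static int
page_shortage(int inactive_target, int free, int inactive, int cache,
    int pages_freed, int free_min)
{
	int shortage = inactive_target - (free + inactive + cache);

	if (shortage <= 0)		/* inactive queue looks healthy */
		shortage = pages_freed == 0 ? free_min - free : 1;
	return (shortage);
}

int
main(void)
{
	/*
	 * Healthy inactive queue but nothing freed this pass: fall back
	 * to the free_min deficit (61 - 40 = 21 pages to deactivate).
	 */
	printf("%d\n", page_shortage(256, 40, 300, 50, 0, 61));
	return (0);
}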
*/ - if ((cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_target) { + if ((cnt.v_cache_count + cnt.v_free_count) < + (cnt.v_free_target + cnt.v_cache_min) ) { if (vnodes_skipped && (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) { if (!vfs_update_wakeup) { vfs_update_wakeup = 1; wakeup(&vfs_update_wakeup); } } /* * now swap processes out if we are in low memory conditions */ if (!swap_pager_full && vm_swap_size && vm_pageout_req_swapout == 0) { vm_pageout_req_swapout = 1; vm_req_vmdaemon(); } } if ((cnt.v_inactive_count + cnt.v_free_count + cnt.v_cache_count) < (cnt.v_inactive_target + cnt.v_free_min)) { vm_req_vmdaemon(); } /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. */ if ((vm_swap_size == 0 || swap_pager_full) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) { bigproc = NULL; bigsize = 0; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { /* * if this is a system process, skip it */ if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) || ((p->p_pid < 48) && (vm_swap_size != 0))) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get the process size */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count; /* * if this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { bigproc = p; bigsize = size; } } if (bigproc != NULL) { printf("Process %lu killed by vm_pageout -- out of swap\n", (u_long) bigproc->p_pid); psignal(bigproc, SIGKILL); bigproc->p_estcpu = 0; bigproc->p_nice = PRIO_MIN; resetpriority(bigproc); wakeup(&cnt.v_free_count); } } return force_wakeup; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { (void) spl0(); /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ cnt.v_pageout_free_min = 6 + cnt.v_page_count / 1024 + cnt.v_interrupt_free_min; cnt.v_free_reserved = cnt.v_pageout_free_min + 6; cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; cnt.v_free_min += cnt.v_free_reserved; if (cnt.v_page_count > 1024) { cnt.v_cache_max = (cnt.v_free_count - 1024) / 2; cnt.v_cache_min = (cnt.v_free_count - 1024) / 8; cnt.v_inactive_target = 2*cnt.v_cache_min + 192; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; swap_pager_swap_init(); /* * The pageout daemon is never done, so loop forever.
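/*
 * Illustrative sketch, not part of the commit: the out-of-swap
 * victim-selection loop above, reduced to plain C. The proc list,
 * P_SYSTEM flag, and pm_stats.resident_count are modeled by a
 * hypothetical struct; as in the code, system processes, init (pid 1),
 * and low pids are skipped (the commit additionally conditions the
 * low-pid test on swap being configured) and the largest resident
 * process is chosen.
 */
#include <stdio.h>
#include <stddef.h>

struct fakeproc {
        int pid;
        int is_system;          /* stands in for P_SYSTEM */
        long resident_pages;    /* stands in for pm_stats.resident_count */
};

static struct fakeproc *pick_victim(struct fakeproc *p, int n)
{
        struct fakeproc *big = NULL;
        long bigsize = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (p[i].is_system || p[i].pid == 1 || p[i].pid < 48)
                        continue;       /* never kill kernel helpers/init */
                if (p[i].resident_pages > bigsize) {
                        big = &p[i];
                        bigsize = p[i].resident_pages;
                }
        }
        return big;
}

int main(void)
{
        struct fakeproc procs[] = {
                { 1, 0, 9000 }, { 30, 1, 500 }, { 120, 0, 2048 },
                { 333, 0, 8192 }, { 400, 0, 1024 },
        };
        struct fakeproc *v = pick_victim(procs, 5);
        if (v != NULL)
                printf("would kill pid %d (%ld pages)\n", v->pid,
                    v->resident_pages);
        return 0;
}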
*/ while (TRUE) { int s = splhigh(); if (!vm_pages_needed || ((cnt.v_free_count >= cnt.v_free_reserved) && (cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min))) { vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", 0); } vm_pages_needed = 0; splx(s); cnt.v_pdwakeups++; vm_pager_sync(); vm_pageout_scan(); vm_pager_sync(); wakeup(&cnt.v_free_count); wakeup(kmem_map); } } static void vm_daemon() { vm_object_t object; struct proc *p; while (TRUE) { tsleep(&vm_daemon_needed, PUSER, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { int overage; quad_t limit; vm_offset_t size; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM | P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count * PAGE_SIZE; if (limit >= 0 && size >= limit) { overage = (size - limit) >> PAGE_SHIFT; vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); } } /* * we remove cached objects that have no RSS... */ restart: object = vm_object_cached_list.tqh_first; while (object) { /* * if there are no resident pages -- get rid of the object */ if (object->resident_page_count == 0) { vm_object_reference(object); pager_cache(object, FALSE); goto restart; } object = object->cached_list.tqe_next; } } } Index: head/sys/vm/vm_unix.c =================================================================== --- head/sys/vm/vm_unix.c (revision 13489) +++ head/sys/vm/vm_unix.c (revision 13490) @@ -1,119 +1,121 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
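/*
 * Illustrative sketch, not part of the commit: the tsleep()/wakeup()
 * handshake that drives vm_pageout() and vm_daemon() above, loosely
 * modeled with POSIX threads. Sleeping on an address maps onto a
 * condition variable plus a predicate; all names below are
 * hypothetical. Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t need = PTHREAD_COND_INITIALIZER;
static int pages_needed = 0;
static int shutting_down = 0;

static void *daemon_loop(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!shutting_down) {
                while (!pages_needed && !shutting_down)
                        pthread_cond_wait(&need, &lock);  /* ~tsleep() */
                if (pages_needed) {
                        pages_needed = 0;
                        printf("daemon: scanning queues\n");
                }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, daemon_loop, NULL);

        pthread_mutex_lock(&lock);
        pages_needed = 1;                 /* ~wakeup(&vm_pages_needed) */
        pthread_cond_signal(&need);
        pthread_mutex_unlock(&lock);

        pthread_mutex_lock(&lock);
        shutting_down = 1;
        pthread_cond_broadcast(&need);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}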
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ * * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 - * $Id: vm_unix.c,v 1.8 1995/11/12 06:43:28 bde Exp $ + * $Id: vm_unix.c,v 1.9 1995/12/07 12:48:29 davidg Exp $ */ /* * Traditional sbrk/grow interface to VM */ #include #include #include #include #include #include #include #include #include #include #include +#include #ifndef _SYS_SYSPROTO_H_ struct obreak_args { char *nsize; }; #endif /* ARGSUSED */ int obreak(p, uap, retval) struct proc *p; struct obreak_args *uap; int *retval; { register struct vmspace *vm = p->p_vmspace; vm_offset_t new, old; int rv; register int diff; old = (vm_offset_t) vm->vm_daddr; new = round_page(uap->nsize); if ((int) (new - old) > p->p_rlimit[RLIMIT_DATA].rlim_cur) return (ENOMEM); old = round_page(old + ctob(vm->vm_dsize)); diff = new - old; if (diff > 0) { if (swap_pager_full) { return (ENOMEM); } - rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE); + rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE, + VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize += btoc(diff); } else if (diff < 0) { diff = -diff; rv = vm_map_remove(&vm->vm_map, new, new + diff); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize -= btoc(diff); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct ovadvise_args { int anom; }; #endif /* ARGSUSED */ int ovadvise(p, uap, retval) struct proc *p; struct ovadvise_args *uap; int *retval; { return (EINVAL); } Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 13489) +++ head/sys/vm/vnode_pager.c (revision 13490) @@ -1,960 +1,961 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.56 1995/12/14 09:55:14 phk Exp $ + * $Id: vnode_pager.c,v 1.57 1995/12/17 23:29:56 dyson Exp $ */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address, int *run)); static void vnode_pager_iodone __P((struct buf *bp)); static int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m)); static int vnode_pager_input_old __P((vm_object_t object, vm_page_t m)); static void vnode_pager_dealloc __P((vm_object_t)); static int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); struct pagerops vnodepagerops = { NULL, vnode_pager_alloc, vnode_pager_dealloc, vnode_pager_getpages, vnode_pager_putpages, vnode_pager_haspage, NULL }; static int vnode_pager_leaf_getpages __P((vm_object_t object, vm_page_t *m, int count, int reqpage)); static int vnode_pager_leaf_putpages __P((vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals)); /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_object_t vnode_pager_alloc(handle, size, prot, offset) void *handle; vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; /* * Prevent race condition when allocating the object. This * can happen with NFS vnodes since the nfsnode isn't locked. */ while (vp->v_flag & VOLOCK) { vp->v_flag |= VOWANT; tsleep(vp, PVM, "vnpobj", 0); } vp->v_flag |= VOLOCK; /* * If the object is being terminated, wait for it to * go away. */ - while (((object = vp->v_object) != NULL) && (object->flags & OBJ_DEAD)) { + while (((object = vp->v_object) != NULL) && + (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "vadead", 0); } if (object == NULL) { /* * And an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, size); object->flags = OBJ_CANPERSIST; /* * Hold a reference to the vnode and initialize object data. 
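/*
 * Illustrative sketch, not part of the commit: the VOLOCK/VOWANT flag
 * protocol used in vnode_pager_alloc() above, modeled with POSIX
 * threads. In the kernel the flag pair plus tsleep()/wakeup() form a
 * tiny sleep lock that serializes object allocation per vnode; here a
 * mutex/condvar plays the role of sleeping on the vnode address. The
 * struct and function names are hypothetical. Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

#define VOLOCK 0x1      /* someone is allocating the object */
#define VOWANT 0x2      /* someone else is waiting for them */

struct fakevnode {
        int flags;
        pthread_mutex_t mtx;
        pthread_cond_t cv;
};

static void vn_obj_lock(struct fakevnode *vp)
{
        pthread_mutex_lock(&vp->mtx);
        while (vp->flags & VOLOCK) {
                vp->flags |= VOWANT;              /* record the waiter */
                pthread_cond_wait(&vp->cv, &vp->mtx);
        }
        vp->flags |= VOLOCK;
        pthread_mutex_unlock(&vp->mtx);
}

static void vn_obj_unlock(struct fakevnode *vp)
{
        pthread_mutex_lock(&vp->mtx);
        vp->flags &= ~VOLOCK;
        if (vp->flags & VOWANT) {                 /* ~wakeup(vp) */
                vp->flags &= ~VOWANT;
                pthread_cond_broadcast(&vp->cv);
        }
        pthread_mutex_unlock(&vp->mtx);
}

int main(void)
{
        struct fakevnode v = { 0, PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER };
        vn_obj_lock(&v);
        printf("allocating object race-free\n");
        vn_obj_unlock(&v);
        return 0;
}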
*/ VREF(vp); object->un_pager.vnp.vnp_size = (vm_ooffset_t) size * PAGE_SIZE; object->handle = handle; vp->v_object = object; } else { /* * vm_object_reference() will remove the object from the cache if * found and gain a reference to the object. */ vm_object_reference(object); } if (vp->v_type == VREG) vp->v_flag |= VVMIO; vp->v_flag &= ~VOLOCK; if (vp->v_flag & VOWANT) { vp->v_flag &= ~VOWANT; wakeup(vp); } return (object); } static void vnode_pager_dealloc(object) vm_object_t object; { register struct vnode *vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); if (object->paging_in_progress) { int s = splbio(); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "vnpdea", 0); } splx(s); } object->handle = NULL; vp->v_object = NULL; vp->v_flag &= ~(VTEXT | VVMIO); vp->v_flag |= VAGE; vrele(vp); } static boolean_t vnode_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; /* * If filesystem no longer mounted or offset beyond end of file we do * not have the page. */ if ((vp->v_mount == NULL) || (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn, after, before); if (err) return TRUE; if ( bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { int numafter; *after *= pagesperblock; numafter = pagesperblock - (poff + 1); if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) { numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex))); } *after += numafter; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(vp, nsize) struct vnode *vp; vm_ooffset_t nsize; { vm_object_t object = vp->v_object; if (object == NULL) return; /* * Hasn't changed size */ if (nsize == object->un_pager.vnp.vnp_size) return; /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nsize < object->un_pager.vnp.vnp_size) { vm_ooffset_t nsizerounded; nsizerounded = IDX_TO_OFF(OFF_TO_IDX(nsize + PAGE_SIZE - 1)); if (nsizerounded < object->un_pager.vnp.vnp_size) { vm_object_page_remove(object, OFF_TO_IDX(nsize + PAGE_SIZE - 1), OFF_TO_IDX(object->un_pager.vnp.vnp_size), FALSE); } /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode... 
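/*
 * Illustrative sketch, not part of the commit: the page-to-filesystem-
 * block conversions done by vnode_pager_haspage() above, standalone.
 * With bsize >= PAGE_SIZE, several pages share one block
 * (pagesperblock); with bsize < PAGE_SIZE, one page spans several
 * blocks (blocksperpage). The sizes chosen here are hypothetical.
 */
#include <stdio.h>

#define PAGE_SIZE 4096L

int main(void)
{
        long pindex = 13;       /* page index within the file */
        long bsize;

        bsize = 8192;           /* big-block case, e.g. an 8K filesystem */
        if (bsize >= PAGE_SIZE) {
                long pagesperblock = bsize / PAGE_SIZE;
                long reqblock = pindex / pagesperblock;
                long poff = pindex - reqblock * pagesperblock;
                printf("page %ld -> fs block %ld, page %ld within it\n",
                    pindex, reqblock, poff);
        }

        bsize = 1024;           /* small-block case, e.g. a 1K filesystem */
        if (bsize < PAGE_SIZE) {
                long blocksperpage = PAGE_SIZE / bsize;
                long reqblock = pindex * blocksperpage;
                printf("page %ld -> starts at fs block %ld (%ld blocks)\n",
                    pindex, reqblock, blocksperpage);
        }
        return 0;
}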
*/ if (nsize & PAGE_MASK) { vm_offset_t kva; vm_page_t m; m = vm_page_lookup(object, OFF_TO_IDX(nsize)); if (m) { kva = vm_pager_map_page(m); bzero((caddr_t) kva + (nsize & PAGE_MASK), (int) (round_page(nsize) - nsize)); vm_pager_unmap_page(kva); } } } object->un_pager.vnp.vnp_size = nsize; object->size = OFF_TO_IDX(nsize + PAGE_SIZE - 1); } void vnode_pager_umount(mp) register struct mount *mp; { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Vnode can be reclaimed by getnewvnode() while we * traverse the list. */ if (vp->v_mount != mp) goto loop; /* * Save the next pointer now since uncaching may terminate the * object and render vnode invalid */ nvp = vp->v_mntvnodes.le_next; if (vp->v_object != NULL) { VOP_LOCK(vp); vnode_pager_uncache(vp); VOP_UNLOCK(vp); } } } /* * Remove vnode associated object from the object cache. * This routine must be called with the vnode locked. * * XXX unlock the vnode. * We must do this since uncaching the object may result in its * destruction which may initiate paging activity which may necessitate * re-locking the vnode. */ void vnode_pager_uncache(vp) struct vnode *vp; { vm_object_t object; /* * Not a mapped vnode */ object = vp->v_object; if (object == NULL) return; vm_object_reference(object); VOP_UNLOCK(vp); pager_cache(object, FALSE); VOP_LOCK(vp); return; } void vnode_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static vm_offset_t vnode_pager_addr(vp, address, run) struct vnode *vp; vm_ooffset_t address; int *run; { int rtaddress; int bsize; daddr_t block; struct vnode *rtvp; int err; daddr_t vblock; int voffset; if ((int) address < 0) return -1; if (vp->v_mount == NULL) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL); if (err || (block == -1)) rtaddress = -1; else { rtaddress = block + voffset / DEV_BSIZE; if( run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return rtaddress; } /* * interrupt routine for I/O completion */ static void vnode_pager_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; wakeup(bp); } /* * small block file system vnode pager input */ static int vnode_pager_input_smlfs(object, m) vm_object_t object; vm_page_t m; { int i; int s; struct vnode *dp, *vp; struct buf *bp; vm_offset_t kva; int fileaddr; vm_offset_t bsize; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &dp, 0, NULL, NULL); kva = vm_pager_map_page(m); for (i = 0; i < PAGE_SIZE / bsize; i++) { if ((vm_page_bits(IDX_TO_OFF(m->pindex) + i * bsize, bsize) & m->valid)) continue; fileaddr = vnode_pager_addr(vp, IDX_TO_OFF(m->pindex) + i * bsize, (int *)0); if (fileaddr != -1) { bp = getpbuf(); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr; pbgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; /* do the input */ VOP_STRATEGY(bp); /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnsrd", 0); } splx(s); if ((bp->b_flags & B_ERROR) 
!= 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); if (error) break; vm_page_set_validclean(m, (i * bsize) & (PAGE_SIZE-1), bsize); } else { vm_page_set_validclean(m, (i * bsize) & (PAGE_SIZE-1), bsize); bzero((caddr_t) kva + i * bsize, bsize); } } vm_pager_unmap_page(kva); pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags &= ~PG_ZERO; if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(object, m) vm_object_t object; vm_page_t m; { struct uio auio; struct iovec aiov; int error; int size; vm_offset_t kva; error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ kva = vm_pager_map_page(m); aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_procp = (struct proc *) 0; error = VOP_READ(object->handle, &auio, 0, curproc->p_ucred); if (!error) { register int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t) kva + count, PAGE_SIZE - count); } vm_pager_unmap_page(kva); } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->dirty = 0; m->flags &= ~PG_ZERO; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ static int vnode_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_GETPAGES(vp, m, count*PAGE_SIZE, reqpage, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_getpages(object, m, count, reqpage); else return rtval; } static int vnode_pager_leaf_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_offset_t kva; off_t foff; int i, size, bsize, first, firstaddr; struct vnode *dp, *vp; int runpg; int runend; struct buf *bp; int s; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! */ foff = IDX_TO_OFF(m[reqpage]->pindex); /* * if we can't bmap, use old VOP code */ if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_old(object, m[reqpage]); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself.
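/*
 * Illustrative sketch, not part of the commit: the file-offset to
 * disk-address arithmetic of vnode_pager_addr() above, standalone.
 * VOP_BMAP is replaced by a hypothetical lookup table; the key line is
 * the same: the target address is the block's disk address plus the
 * intra-block offset expressed in DEV_BSIZE (512-byte) sectors.
 */
#include <stdio.h>

#define DEV_BSIZE 512L

int main(void)
{
        long bsize = 8192;              /* filesystem block size */
        long address = 3 * 8192 + 4096; /* byte offset into the file */
        long lblkno[] = { 100, 220, 300, 464 };   /* hypothetical bmap */

        long vblock = address / bsize;  /* logical fs block */
        long voffset = address % bsize; /* offset within that block */
        long block = lblkno[vblock];    /* disk address, in sectors */
        long rtaddress = block + voffset / DEV_BSIZE;

        printf("byte %ld -> sector %ld\n", address, rtaddress);
        return 0;
}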
*/ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_smlfs(object, m[reqpage]); } /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block * then, the entire page is valid -- */ if (m[reqpage]->valid) { m[reqpage]->valid = VM_PAGE_BITS_ALL; for (i = 0; i < count; i++) { if (i != reqpage) vnode_pager_freepage(m[i]); } return VM_PAGER_OK; } /* * here on direct device I/O */ firstaddr = -1; /* * calculate the run that includes the required page */ for(first = 0, i = 0; i < count; i = runend) { firstaddr = vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &runpg); if (firstaddr == -1) { if (i == reqpage && foff < object->un_pager.vnp.vnp_size) { panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: %ld, vnp_size: %d", firstaddr, foff, object->un_pager.vnp.vnp_size); } vnode_pager_freepage(m[i]); runend = i + 1; first = runend; continue; } runend = i + runpg; if (runend <= reqpage) { int j; for (j = i; j < runend; j++) { vnode_pager_freepage(m[j]); } } else { if (runpg < (count - first)) { for (i = first + runpg; i < count; i++) vnode_pager_freepage(m[i]); count = first + runpg; } break; } first = runend; } /* * the first and last page have been calculated now, move input pages * to be zero based... */ if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; } count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; /* * round up physical size for real devices */ if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); bp = getpbuf(); kva = (vm_offset_t) bp->b_data; /* * and map the pages to be read into the kva */ pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = firstaddr; pbgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; cnt.v_vnodein++; cnt.v_vnodepgsin += count; /* do the input */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnread", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->valid = VM_PAGE_BITS_ALL; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that empirical results show that * deactivating pages is best.
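/*
 * Illustrative sketch, not part of the commit: the array surgery in
 * vnode_pager_leaf_getpages() above, which trims the page array to the
 * run of contiguous disk blocks containing the requested page and then
 * shifts it to be zero-based. Pages here are just ints for clarity;
 * the run boundaries are hypothetical.
 */
#include <stdio.h>

int main(void)
{
        int m[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        int count = 8, reqpage = 3;
        int first = 2, runpg = 4;   /* hypothetical: run covers m[2..5] */
        int i;

        /* pages after the run are dropped (freed in the real code) */
        if (first + runpg < count)
                count = first + runpg;

        /* pages before the run are dropped, the rest shifted to index 0 */
        if (first != 0) {
                for (i = first; i < count; i++)
                        m[i - first] = m[i];
                count -= first;
                reqpage -= first;
        }

        printf("count=%d reqpage=%d first page=%d\n", count, reqpage, m[0]);
        return 0;
}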
*/ /* * just in case someone was asking for this page, we * now tell them that it is ok to use */ if (!error) { vm_page_deactivate(m[i]); PAGE_WAKEUP(m[i]); } else { vnode_pager_freepage(m[i]); } } } if (error) { printf("vnode_pager_getpages: I/O read error\n"); } return (error ? VM_PAGER_ERROR : VM_PAGER_OK); } static int vnode_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_PUTPAGES(vp, m, count*PAGE_SIZE, sync, rtvals, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_putpages(object, m, count, sync, rtvals); else return rtval; } /* * generic vnode pager output routine */ static int vnode_pager_leaf_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int i; struct vnode *vp; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; vp = object->handle; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; if ((int) m[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%x(%x)\n", m[0]->pindex, m[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(m[0]->pindex); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) maxsize = object->un_pager.vnp.vnp_size - poffset; else maxsize = 0; ncount = (maxsize + PAGE_SIZE - 1) / PAGE_SIZE; if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } #ifdef BOGUS if (ncount == 0) { printf("vnode_pager_putpages: write past end of file: %d, %lu\n", poffset, (unsigned long) object->un_pager.vnp.vnp_size); return rtvals[0]; } #endif } } for (i = 0; i < count; i++) { m[i]->busy++; m[i]->flags &= ~PG_BUSY; } aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_procp = (struct proc *) 0; error = VOP_WRITE(vp, &auio, IO_VMIO|(sync?IO_SYNC:0), curproc->p_ucred); cnt.v_vnodeout++; cnt.v_vnodepgsout += ncount; if (error) { printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { printf("vnode_pager_putpages: residual I/O %d at %ld\n", auio.uio_resid, m[0]->pindex); } for (i = 0; i < count; i++) { m[i]->busy--; if (i < ncount) { rtvals[i] = VM_PAGER_OK; } if ((m[i]->busy == 0) && (m[i]->flags & PG_WANTED)) wakeup(m[i]); } return rtvals[0]; } struct vnode * vnode_pager_lock(object) vm_object_t object; { for (; object != NULL; object = object->backing_object) { if (object->type != OBJT_VNODE) continue; VOP_LOCK(object->handle); return object->handle; } return NULL; }
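/*
 * Illustrative sketch, not part of the commit: the backing-object walk
 * in vnode_pager_lock() above, as plain C over a hypothetical struct.
 * Shadow objects (anonymous copy-on-write layers) are skipped until a
 * vnode-backed object is found, if any.
 */
#include <stdio.h>
#include <stddef.h>

enum objtype { OBJT_DEFAULT, OBJT_VNODE };

struct fakeobj {
        enum objtype type;
        const char *handle;         /* stands in for the vnode pointer */
        struct fakeobj *backing_object;
};

static const char *find_vnode(struct fakeobj *object)
{
        for (; object != NULL; object = object->backing_object) {
                if (object->type != OBJT_VNODE)
                        continue;
                return object->handle;   /* the real code locks it first */
        }
        return NULL;
}

int main(void)
{
        struct fakeobj file = { OBJT_VNODE, "/some/file", NULL };
        struct fakeobj shadow = { OBJT_DEFAULT, NULL, &file };
        const char *h = find_vnode(&shadow);
        printf("vnode: %s\n", h ? h : "(none)");
        return 0;
}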