diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 9c9cacd08b35..12aa8c708c95 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1,2051 +1,2053 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.348 1999/07/02 04:33:05 peter Exp $ + * $Id: machdep.c,v 1.349 1999/07/02 20:33:32 msmith Exp $ */ #include "apm.h" #include "ether.h" #include "npx.h" #include "opt_atalk.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); 
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf SYSCTL_HANDLER_ARGS { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). 
*/ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. 
*/ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); + if( physmem > 65536) + nbuf += (physmem - 65536) / 20; } - nswbuf = max(min(nbuf/4, 64), 16); + nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. 
*/ { vm_offset_t mb_map_size; int xclusters; /* Allow override of NMBCLUSTERS from the kernel environment */ if (getenv_int("kern.ipc.nmbclusters", &xclusters) && xclusters > nmbclusters) nmbclusters = xclusters; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } void netisr_sysinit(data) void *data; { const struct netisrtab *nit; nit = (const struct netisrtab *)data; register_netisr(nit->nit_num, nit->nit_isr); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. 
*/
/*
 * sendsig(catcher, sig, mask, code):
 *
 * Deliver a signal to the current process: build a signal frame
 * (struct sigframe) on the user stack -- or on the alternate signal
 * stack if one is configured and requested for this signal -- then
 * redirect the trapframe so that the process resumes in user mode in
 * the signal trampoline, which invokes `catcher' and eventually calls
 * sigreturn() below.
 *
 *	catcher	- user-mode handler address
 *	sig	- signal number (may be translated through the ABI's
 *		  sv_sigtbl for emulated binaries, see below)
 *	mask	- signal mask for sigreturn() to restore
 *	code	- machine-dependent code passed to the handler
 */
void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig, mask;
	u_long code;
{
	register struct proc *p = curproc;
	register struct trapframe *regs;
	register struct sigframe *fp;	/* user address of the frame */
	struct sigframe sf;		/* kernel copy, copyout()ed below */
	struct sigacts *psp = p->p_sigacts;
	int oonstack;			/* already on the sigstack? */

	regs = p->p_md.md_regs;
	oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
	/*
	 * Allocate and validate space for the signal handler context.
	 */
	if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack &&
	    (psp->ps_sigonstack & sigmask(sig))) {
		/* place the frame at the top of the alternate stack */
		fp = (struct sigframe *)(psp->ps_sigstk.ss_sp +
		    psp->ps_sigstk.ss_size - sizeof(struct sigframe));
		psp->ps_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* push the frame just below the current user %esp */
		fp = (struct sigframe *)regs->tf_esp - 1;
	}

	/*
	 * grow() will return FALSE if the fp will not fit inside the stack
	 *	and the stack can not be grown. useracc will return FALSE
	 *	if access is denied.
	 */
	if ((grow_stack (p, (int)fp) == FALSE) ||
	    (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		SIGACTION(p, SIGILL) = SIG_DFL;
		sig = sigmask(SIGILL);
		p->p_sigignore &= ~sig;
		p->p_sigcatch &= ~sig;
		p->p_sigmask &= ~sig;
		psignal(p, SIGILL);
		return;
	}

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl) {
		/* translate the signal number for emulated ABIs */
		if (sig < p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[sig];
		else
			sig = p->p_sysent->sv_sigsize + 1;
	}
	sf.sf_signum = sig;
	sf.sf_code = code;
	sf.sf_scp = &fp->sf_sc;
	/*
	 * NOTE(review): tf_err apparently doubles as the fault address
	 * for the handler's sf_addr argument -- confirm against trap.c.
	 */
	sf.sf_addr = (char *) regs->tf_err;
	sf.sf_handler = catcher;

	/* save scratch registers */
	sf.sf_sc.sc_eax = regs->tf_eax;
	sf.sf_sc.sc_ebx = regs->tf_ebx;
	sf.sf_sc.sc_ecx = regs->tf_ecx;
	sf.sf_sc.sc_edx = regs->tf_edx;
	sf.sf_sc.sc_esi = regs->tf_esi;
	sf.sf_sc.sc_edi = regs->tf_edi;
	sf.sf_sc.sc_cs = regs->tf_cs;
	sf.sf_sc.sc_ds = regs->tf_ds;
	sf.sf_sc.sc_ss = regs->tf_ss;
	sf.sf_sc.sc_es = regs->tf_es;
	sf.sf_sc.sc_fs = regs->tf_fs;
	sf.sf_sc.sc_isp = regs->tf_isp;

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	sf.sf_sc.sc_onstack = oonstack;
	sf.sf_sc.sc_mask = mask;
	sf.sf_sc.sc_sp = regs->tf_esp;
	sf.sf_sc.sc_fp = regs->tf_ebp;
	sf.sf_sc.sc_pc = regs->tf_eip;
	sf.sf_sc.sc_ps = regs->tf_eflags;
	sf.sf_sc.sc_trapno = regs->tf_trapno;
	sf.sf_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;

		sf.sf_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_sc.sc_es = tf->tf_vm86_es;
		sf.sf_sc.sc_ds = tf->tf_vm86_ds;

		/* without VME hardware, VIF/VIP live in the emulated flags */
		if (vm86->vm86_has_vme == 0)
			sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP))
			    | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * We should never have PSL_T set when returning from vm86
		 * mode.  It may be set here if we deliver a signal before
		 * getting to vm86 mode, so turn it off.
		 *
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(p, SIGILL);
	}

	/*
	 * Resume at the signal trampoline, which sits sv_szsigcode bytes
	 * below PS_STRINGS, with %esp pointing at the new frame.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.
Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		struct sigcontext *sigcntxp;
	} */ *uap;
{
	register struct sigcontext *scp;	/* user-supplied context */
	register struct sigframe *fp;		/* enclosing signal frame */
	register struct trapframe *regs = p->p_md.md_regs;
	int eflags;

	/*
	 * (XXX old comment) regs->tf_esp points to the return address.
	 * The user scp pointer is above that.
	 * The return address is faked in the signal trampoline code
	 * for consistency.
	 */
	scp = uap->sigcntxp;
	/* back up from the sigcontext to the start of the sigframe */
	fp = (struct sigframe *)
	     ((caddr_t)scp - offsetof(struct sigframe, sf_sc));

	if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0)
		return(EFAULT);

	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		/* returning to vm86 mode: restore the vm86 trapframe */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/* only the user-changeable eflags bits may be taken over */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
#ifdef DEBUG
			printf("sigreturn: eflags = 0x%x\n", eflags);
#endif
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
		if (!CS_SECURE(scp->sc_cs)) {
#ifdef DEBUG
			printf("sigreturn: cs = 0x%x\n", scp->sc_cs);
#endif
			trapsignal(p, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* restore scratch registers */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0)
		return(EINVAL);

	/* restore signal-stack flag and the blocked-signal mask */
	if (scp->sc_onstack & 01)
		p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK;
	/* never let the user block the unmaskable signals */
	p->p_sigmask = scp->sc_mask & ~sigcantmask;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	/* the trapframe itself carries the return state */
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	/* spin in hlt; each interrupt wakes us only to halt again */
	for (;;)
		__asm__ ("hlt");
}

/*
 * Clear registers on exec
 */
void
setregs(p, entry, stack, ps_strings)
	struct proc *p;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe
*regs = p->p_md.md_regs; struct pcb *pcb = &p->p_addr->u_pcb; #ifdef USER_LDT /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) { lldt(_default_ldt); currentldt = _default_ldt; } kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ pcb->pcb_gs = _udatasel; if (pcb == curpcb) { load_gs(_udatasel); } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. 
*/ p->p_retval[1] = 0; } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; #ifdef SMP union descriptor gdt[NGDT * NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifndef SMP extern struct segment_descriptor common_tssd, *tss_gdt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, 
/* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment 
descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 8 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 9 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 10 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 11 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* 
segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), 
IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/length pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be constrained by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * If we cannot accurately determine the physical memory map, and the * value from the RTC seems dubious, trust the value of hw.physmem/MAXMEM * instead, but require a speculative probe of memory. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; int speculative_mprobe = FALSE; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; u_int64_t AllowMem, MaxMem, sanity; const char *cp, *ep; struct { u_int64_t base; u_int64_t length; u_int32_t type; } *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. * After this calculation, AllowMem is either 0 (no memory size cap) * or the maximum memory size desired in bytes. 
*/ AllowMem = 0; if ((cp = getenv("hw.physmem")) != NULL) { sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Warning: invalid memory limit '%s' specified\n", cp); } #ifdef MAXMEM if (AllowMem == 0) AllowMem = MAXMEM * (u_int64_t)1024; #endif if ((AllowMem != 0) && (boothowto & RB_VERBOSE)) printf("Physical memory use limited to %uk\n", (u_int)(AllowMem / 1024)); MaxMem = AllowMem; if (AllowMem == 0) AllowMem = (u_int64_t)1 << 32; /* 4GB limit imposed by 32-bit pmap */ /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ #define SMAPSIZ sizeof(*smap) #define SMAP_SIG 0x534D4150 /* 'SMAP' */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = SMAPSIZ; i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= AllowMem) { printf("%uk of memory above %uk ignored\n", (u_int)(smap->length / 1024), (u_int)(AllowMem / 1024)); goto next_run; } if ((smap->base + smap->length) >= AllowMem) { printf("%uk region truncated to %uk to fit %uk limit\n", (u_int)(smap->length / 1024), (u_int)((AllowMem - smap->base) / 1024), (u_int)(AllowMem / 1024)); smap->length = AllowMem - smap->base; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] 
+= smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); /* * If we failed above, try memory map with INT 15:E801 */ if (physmap[1] == 0) { vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory, or * hw.physmem/MAXMEM overrides. */ if (MaxMem > (1024 * 1024)) { /* < 1MB is insane */ extmem = (MaxMem / 1024) - 1024; } else { extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); } /* * If the value from the RTC is >= 16M, there is a good * chance that it's lying. Compaq systems never report * more than 16M, and no system can honestly report more * than 64M. We should end up here only on extremely * old and broken systems. In any case, qualify the value * that we've got here by actually checking for physical * memory later on. */ if (extmem >= 16 * 1024) speculative_mprobe = TRUE; #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We fiddle it again * later based on the results of the memory test. */ Maxmem = physmap[physmap_idx + 1] / PAGE_SIZE; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; if (boothowto & RB_VERBOSE) printf("Testing memory %uk to %uk\n", (u_int)(physmap[i] / 1024), (u_int)((physmap[i] + physmap[i+1]) / 1024)); end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) end += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0]; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GPROC0_SEL].ssd_base = (int) &common_tss; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT currentldt = _default_ldt; #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, 
&IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; common_tssd = *tss_gdt; common_tss.tss_ioopt = (sizeof common_tss) << 16; ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; #ifdef SMP proc0.p_addr->u_pcb.pcb_mpnest = 1; #endif proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); 
} int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; 
struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? 
*/ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 9c9cacd08b35..12aa8c708c95 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1,2051 +1,2053 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.348 1999/07/02 04:33:05 peter Exp $ + * $Id: machdep.c,v 1.349 1999/07/02 20:33:32 msmith Exp $ */ #include "apm.h" #include "ether.h" #include "npx.h" #include "opt_atalk.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 
0, ""); int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf SYSCTL_HANDLER_ARGS { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). */ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END 
((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. 
*/ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! */ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); + if( physmem > 65536) + nbuf += (physmem - 65536) / 20; } - nswbuf = max(min(nbuf/4, 64), 16); + nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. 
Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; int xclusters; /* Allow override of NMBCLUSTERS from the kernel environment */ if (getenv_int("kern.ipc.nmbclusters", &xclusters) && xclusters > nmbclusters) nmbclusters = xclusters; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } void netisr_sysinit(data) void *data; { const struct netisrtab *nit; nit = (const struct netisrtab *)data; register_netisr(nit->nit_num, nit->nit_isr); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. 
After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig, mask;
	u_long code;
{
	register struct proc *p = curproc;
	register struct trapframe *regs;
	register struct sigframe *fp;
	struct sigframe sf;
	struct sigacts *psp = p->p_sigacts;
	int oonstack;

	regs = p->p_md.md_regs;
	oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
	/*
	 * Allocate and validate space for the signal handler context:
	 * use the alternate signal stack if configured, requested for
	 * this signal, and not already in use; otherwise push the frame
	 * just below the current user %esp.
	 */
	if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack &&
	    (psp->ps_sigonstack & sigmask(sig))) {
		fp = (struct sigframe *)(psp->ps_sigstk.ss_sp +
		    psp->ps_sigstk.ss_size - sizeof(struct sigframe));
		psp->ps_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		fp = (struct sigframe *)regs->tf_esp - 1;
	}

	/*
	 * grow() will return FALSE if the fp will not fit inside the stack
	 *	and the stack can not be grown. useracc will return FALSE
	 *	if access is denied.
	 */
	if ((grow_stack (p, (int)fp) == FALSE) ||
	    (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		SIGACTION(p, SIGILL) = SIG_DFL;
		/* note: `sig' is reused as a signal MASK from here on */
		sig = sigmask(SIGILL);
		p->p_sigignore &= ~sig;
		p->p_sigcatch &= ~sig;
		p->p_sigmask &= ~sig;
		psignal(p, SIGILL);
		return;
	}

	/*
	 * Build the argument list for the signal handler,
	 * translating the signal number through the emulation's
	 * signal table if one is present.
	 */
	if (p->p_sysent->sv_sigtbl) {
		if (sig < p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[sig];
		else
			sig = p->p_sysent->sv_sigsize + 1;
	}
	sf.sf_signum = sig;
	sf.sf_code = code;
	sf.sf_scp = &fp->sf_sc;
	sf.sf_addr = (char *) regs->tf_err;
	sf.sf_handler = catcher;

	/* save scratch registers */
	sf.sf_sc.sc_eax = regs->tf_eax;
	sf.sf_sc.sc_ebx = regs->tf_ebx;
	sf.sf_sc.sc_ecx = regs->tf_ecx;
	sf.sf_sc.sc_edx = regs->tf_edx;
	sf.sf_sc.sc_esi = regs->tf_esi;
	sf.sf_sc.sc_edi = regs->tf_edi;
	sf.sf_sc.sc_cs = regs->tf_cs;
	sf.sf_sc.sc_ds = regs->tf_ds;
	sf.sf_sc.sc_ss = regs->tf_ss;
	sf.sf_sc.sc_es = regs->tf_es;
	sf.sf_sc.sc_fs = regs->tf_fs;
	sf.sf_sc.sc_isp = regs->tf_isp;

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	sf.sf_sc.sc_onstack = oonstack;
	sf.sf_sc.sc_mask = mask;
	sf.sf_sc.sc_sp = regs->tf_esp;
	sf.sf_sc.sc_fp = regs->tf_ebp;
	sf.sf_sc.sc_pc = regs->tf_eip;
	sf.sf_sc.sc_ps = regs->tf_eflags;
	sf.sf_sc.sc_trapno = regs->tf_trapno;
	sf.sf_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;

		sf.sf_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_sc.sc_es = tf->tf_vm86_es;
		sf.sf_sc.sc_ds = tf->tf_vm86_ds;

		/* without VME hardware support, VIF/VIP live in software */
		if (vm86->vm86_has_vme == 0)
			sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP))
			    | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * We should never have PSL_T set when returning from vm86
		 * mode.  It may be set here if we deliver a signal before
		 * getting to vm86 mode, so turn it off.
		 *
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(p, SIGILL);
	}

	/* redirect the return to user mode at the signal trampoline */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		struct sigcontext *sigcntxp;
	} */ *uap;
{
	register struct sigcontext *scp;
	register struct sigframe *fp;
	register struct trapframe *regs = p->p_md.md_regs;
	int eflags;

	/*
	 * (XXX old comment) regs->tf_esp points to the return address.
	 * The user scp pointer is above that.
	 * The return address is faked in the signal trampoline code
	 * for consistency.
	 */
	scp = uap->sigcntxp;
	/* recover the enclosing sigframe from the user's sigcontext pointer */
	fp = (struct sigframe *)
	    ((caddr_t)scp - offsetof(struct sigframe, sf_sc));

	if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0)
		return(EFAULT);

	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/*
		 * Merge only the user-changeable eflags bits; with VME,
		 * VIF/VIP are handled by hardware, otherwise they are
		 * kept in software state.
		 */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
#ifdef DEBUG
			printf("sigreturn: eflags = 0x%x\n", eflags);
#endif
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
		if (!CS_SECURE(scp->sc_cs)) {
#ifdef DEBUG
			printf("sigreturn: cs = 0x%x\n", scp->sc_cs);
#endif
			trapsignal(p, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* restore scratch registers */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	/*
	 * NOTE(review): this access check happens after the trapframe has
	 * already been partially updated above — looks redundant with the
	 * earlier useracc() on fp; confirm against repository history.
	 */
	if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0)
		return(EINVAL);

	if (scp->sc_onstack & 01)
		p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK;
	p->p_sigmask = scp->sc_mask & ~sigcantmask;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible: spin forever executing HLT.
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Clear registers on exec
 */
void
setregs(p, entry, stack, ps_strings)
	struct proc *p;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe *regs = p->p_md.md_regs;
	struct pcb *pcb = &p->p_addr->u_pcb;

#ifdef USER_LDT
	/* was i386_user_cleanup() in NetBSD */
	if (pcb->pcb_ldt) {
		/* if this process's LDT is live, switch back to the default */
		if (pcb == curpcb) {
			lldt(_default_ldt);
			currentldt = _default_ldt;
		}
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt,
			pcb->pcb_ldt_len * sizeof(union descriptor));
		pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0;
	}
#endif

	/* start from a clean trapframe; preserve only the trace flag */
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS
value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/* reset %gs as well */
	pcb->pcb_gs = _udatasel;
	if (pcb == curpcb) {
		load_gs(_udatasel);
	}

        /*
         * Initialize the math emulator (if any) for the current process.
         * Actually, just clear the bit that says that the emulator has
         * been initialized.  Initialization is delayed until the process
         * traps to the emulator (if it is done at all) mainly because
         * emulators don't provide an entry point for initialization.
         */
	p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP;

	/*
	 * Arrange to trap the next npx or `fwait' instruction (see npx.c
	 * for why fwait must be trapped at least if there is an npx or an
	 * emulator).  This is mainly to handle the case where npx0 is not
	 * configured, since the npx routines normally set up the trap
	 * otherwise.  It should be done only at boot time, but doing it
	 * here allows modifying `npx_exists' for testing the emulator on
	 * systems with an npx.
	 */
	load_cr0(rcr0() | CR0_MP | CR0_TS);

#if NNPX > 0
	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);
#endif

	/*
	 * XXX - Linux emulator
	 * Make sure sure edx is 0x0 on entry. Linux binaries depend
	 * on it.
	 */
	p->p_retval[1] = 0;
}

/*
 * Sysctl handler for machdep.adjkerntz: store the new value and, if it
 * actually changed via a write, push the adjusted time out to the RTC.
 */
static int
sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
#ifdef SMP
union descriptor gdt[NGDT * NCPU];	/* global descriptor table */
#else
union descriptor gdt[NGDT];		/* global descriptor table */
#endif
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */
#ifdef SMP
/* table descriptors - used to load tables by microp */
struct region_descriptor r_gdt, r_idt;
#endif

#ifndef SMP
extern struct segment_descriptor common_tssd, *tss_gdt;
#endif
int private_tss;			/* flag indicating private tss */

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

/* TSS and stack used to field a double fault on a known-good stack */
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern  struct user *proc0paddr;

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,
/* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment 
descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 8 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 9 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 10 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 11 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* 
segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Install an interrupt gate in IDT slot `idx': handler `func', gate
 * type `typ', privilege level `dpl', code selector `selec'.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu),
	IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall);

/*
 * Convert a hardware segment descriptor `sd' into its software
 * (soft_segment_descriptor) form in `ssd', reassembling the split
 * base and limit fields.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

#define PHYSMAP_SIZE	(2 * 8)

/*
 * Populate the (physmap) array with base/length pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be constrained by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * If we cannot accurately determine the physical memory map, and the
 * value from the RTC seems dubious, trust the value of hw.physmem/MAXMEM
 * instead, but require a speculative probe of memory.
 */
static void
getmemsize(int first)
{
	int i, physmap_idx, pa_indx;
	u_int basemem, extmem;
	int speculative_mprobe = FALSE;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_offset_t pa, physmap[PHYSMAP_SIZE];
	pt_entry_t pte;
	u_int64_t AllowMem, MaxMem, sanity;
	const char *cp, *ep;
	/* INT 15:E820 "SMAP" buffer layout */
	struct {
		u_int64_t base;
		u_int64_t length;
		u_int32_t type;
	} *smap;

	bzero(&vmf, sizeof(struct vm86frame));
	bzero(physmap, sizeof(physmap));

	/*
	 * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes
	 * for the appropriate modifiers.
	 * After this calculation, AllowMem is either 0 (no memory size cap)
	 * or the maximum memory size desired in bytes.
*/ AllowMem = 0; if ((cp = getenv("hw.physmem")) != NULL) { sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Warning: invalid memory limit '%s' specified\n", cp); } #ifdef MAXMEM if (AllowMem == 0) AllowMem = MAXMEM * (u_int64_t)1024; #endif if ((AllowMem != 0) && (boothowto & RB_VERBOSE)) printf("Physical memory use limited to %uk\n", (u_int)(AllowMem / 1024)); MaxMem = AllowMem; if (AllowMem == 0) AllowMem = (u_int64_t)1 << 32; /* 4GB limit imposed by 32-bit pmap */ /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ #define SMAPSIZ sizeof(*smap) #define SMAP_SIG 0x534D4150 /* 'SMAP' */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = SMAPSIZ; i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= AllowMem) { printf("%uk of memory above %uk ignored\n", (u_int)(smap->length / 1024), (u_int)(AllowMem / 1024)); goto next_run; } if ((smap->base + smap->length) >= AllowMem) { printf("%uk region truncated to %uk to fit %uk limit\n", (u_int)(smap->length / 1024), (u_int)((AllowMem - smap->base) / 1024), (u_int)(AllowMem / 1024)); smap->length = AllowMem - smap->base; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] 
+= smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); /* * If we failed above, try memory map with INT 15:E801 */ if (physmap[1] == 0) { vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory, or * hw.physmem/MAXMEM overrides. */ if (MaxMem > (1024 * 1024)) { /* < 1MB is insane */ extmem = (MaxMem / 1024) - 1024; } else { extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); } /* * If the value from the RTC is >= 16M, there is a good * chance that it's lying. Compaq systems never report * more than 16M, and no system can honestly report more * than 64M. We should end up here only on extremely * old and broken systems. In any case, qualify the value * that we've got here by actually checking for physical * memory later on. */ if (extmem >= 16 * 1024) speculative_mprobe = TRUE; #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We fiddle it again * later based on the results of the memory test. */ Maxmem = physmap[physmap_idx + 1] / PAGE_SIZE; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; if (boothowto & RB_VERBOSE) printf("Testing memory %uk to %uk\n", (u_int)(physmap[i] / 1024), (u_int)((physmap[i] + physmap[i+1]) / 1024)); end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) end += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0]; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GPROC0_SEL].ssd_base = (int) &common_tss; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT currentldt = _default_ldt; #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, 
&IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; common_tssd = *tss_gdt; common_tss.tss_ioopt = (sizeof common_tss) << 16; ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; #ifdef SMP proc0.p_addr->u_pcb.pcb_mpnest = 1; #endif proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); 
} int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; 
struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? 
*/ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 00293bed6505..5c478c6ccb4a 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,2973 +1,3056 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.218 1999/06/28 15:32:10 peter Exp $ + * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #define VMIO #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf *buf; /* buffer header pool */ struct swqueue bswlist; static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); -static void flushdirtybuffers(int slpflag, int slptimeo); static int flushbufqueues(void); +static int bd_request; + +static void buf_daemon __P((void)); /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. 
It would be better to split the buffer
CTLFLAG_RW, + &getnewbufcalls, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW, + &getnewbufloops, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW, + &getnewbufloops1, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW, + &getnewbufloops2, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW, + &getnewbufloops3, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, + &getnewbufrestarts, 0, ""); static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; #define BUF_MAXUSE 24 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_RESERVED02 0x02 /* unused */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ /* * kvaspacewakeup: * * Called when kva space is potential available for recovery or when * kva space is recovered in the buffer_map. This function wakes up * anyone waiting for buffer_map kva space. Even though the buffer_map * is larger then maxbufspace, this situation will typically occur * when the buffer_map gets fragmented. */ static __inline void kvaspacewakeup(void) { /* * If someone is waiting for KVA space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_KVASPACE) { needsbuffer &= ~VFS_BIO_NEED_KVASPACE; wakeup(&needsbuffer); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery or when * buffer space is recovered. getnewbuf() will block on this flag when * it is unable to free sufficient buffer space. Buffer space becomes * recoverable when bp's get placed back in the queues. 
*/ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { ++numfreebuffers; if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } +static __inline__ +void +bd_wakeup(int dirtybuflevel) +{ + if (numdirtybuffers >= dirtybuflevel && bd_request == 0) { + bd_request = 1; + wakeup(&bd_request); + } +} + /* * Initialize buffer headers and related structures. 
*/ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); simple_lock_init(&buftimelock); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem * blocks to be 8K. If you happen to use a 16K filesystem, the size * of the buffer cache is still the same as it would be for 8K * filesystems. This keeps the size of the buffer cache "in check" * for big block filesystems. * * maxbufspace is calculated as around 50% of the KVA available in * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the * effect of fragmentation. */ maxbufspace = (nbuf + 8) * DFLTBSIZE; if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE) hibufspace = 3 * maxbufspace / 4; +#if 0 /* * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed */ maxvmiobufspace = 2 * hibufspace / 3; +#endif /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. 
*/ - lodirtybuffers = nbuf / 16 + 10; - hidirtybuffers = nbuf / 8 + 20; + lodirtybuffers = nbuf / 6 + 10; + hidirtybuffers = nbuf / 3 + 20; numdirtybuffers = 0; /* * Try to keep the number of free buffers in the specified range, * and give the syncer access to an emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * Free the kva allocation for a buffer * Must be called only at splbio or higher, * as this is the only locking for buffer_map. */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize) { vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; kvaspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { - if (bp->b_qindex == QUEUE_EMPTY) { + if (bp->b_qindex == QUEUE_EMPTYKVA) { kvafreespace -= bp->b_kvasize; } if (BUF_REFCNT(bp) == 1) TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); else if (BUF_REFCNT(bp) == 0) panic("bremfree: not locked"); else /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("bremfree: multiple refs"); bp->b_qindex = QUEUE_NONE; runningbufspace += bp->b_bufsize; } else { #if !defined(MAX_PERF) panic("bremfree: removing a buffer when not on a queue"); #endif } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. 
*/ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { + case QUEUE_DIRTY: + case QUEUE_CLEAN: case QUEUE_EMPTY: - case QUEUE_LRU: - case QUEUE_AGE: + case QUEUE_EMPTYKVA: --numfreebuffers; break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear B_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. 
*/ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. 
*/ int bwrite(struct buf * bp) { int oldflags, s; struct vnode *vp; struct mount *mp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not busy???"); #endif s = splbio(); bundirty(bp); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); VOP_STRATEGY(bp->b_vp, bp); /* * Collect statistics on synchronous and asynchronous writes. * Writes to block devices are charged to their associated * filesystem (if any). */ if ((vp = bp->b_vp) != NULL) { if (vp->v_type == VBLK) mp = vp->v_specmountpoint; else mp = vp->v_mount; if (mp != NULL) { if ((oldflags & B_ASYNC) == 0) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; } } if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); brelse(bp); return (rtval); } return (0); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf * bp) { struct vnode *vp; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bdwrite: buffer is not busy"); #endif if (bp->b_flags & B_INVAL) { brelse(bp); return; } bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. 
Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); + /* + * Wakeup the buffer flushing daemon if we have saturated the + * buffer cache. + */ + + bd_wakeup(hidirtybuffers); + /* * XXX The soft dependency code is not prepared to * have I/O done when a bdwrite is requested. For * now we just let the write be delayed if it is * requested by the soft dependency code. */ if ((vp = bp->b_vp) && ((vp->v_type == VBLK && vp->v_specmountpoint && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) return; - - if (numdirtybuffers >= hidirtybuffers) - flushdirtybuffers(0, 0); } /* * bdirty: * * Turn buffer into delayed write request. We must clear B_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. 
*/ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; + bd_wakeup(hidirtybuffers); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; } } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp->b_vp, bp); } /* * bowrite: * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is * released when the output completes. bwrite() ( or the VOP routine * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) { bp->b_flags |= B_ORDERED | B_ASYNC; return (VOP_BWRITE(bp->b_vp, bp)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. 
*/ void brelse(struct buf * bp) { int s; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if 0 if (bp->b_flags & B_CLUSTER) { relpbuf(bp, NULL); return; } #endif s = splbio(); if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) { /* * Failed write, redirty. Must clear B_ERROR to prevent * pages from being scrapped. Note: B_INVAL is ignored * here but will presumably be dealt with later. */ bp->b_flags &= ~B_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); if (bp->b_flags & B_DELWRI) --numdirtybuffers; bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If B_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. B_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. 
If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; /* * Get the base offset and length of the buffer. Note that * for block sizes that are less then PAGE_SIZE, the b_data * base of the buffer does not represent exactly b_offset and * neither b_offset nor b_size are necessarily page aligned. * Instead, the starting position of b_offset is: * * b_data + (b_offset & PAGE_MASK) * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if (m == bogus_page) { obj = (vm_object_t) vp->v_object; poff = OFF_TO_IDX(bp->b_offset); for (j = i; j < bp->b_npages; j++) { m = bp->b_pages[j]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + j); #if !defined(MAX_PERF) if (!m) { panic("brelse: page missing\n"); } #endif bp->b_pages[j] = m; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } if (bp->b_flags & (B_NOCACHE|B_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? 
(PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("brelse: multiple refs"); /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; - bp->b_qindex = QUEUE_EMPTY; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + if (bp->b_kvasize) + bp->b_qindex = QUEUE_EMPTYKVA; + else + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; kvafreespace += bp->b_kvasize; if (bp->b_kvasize) kvaspacewakeup(); /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_flags |= B_INVAL; - bp->b_qindex = QUEUE_AGE; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); - /* buffers with stale but valid contents */ - } else if (bp->b_flags & B_AGE) { - bp->b_qindex = QUEUE_AGE; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); - - /* buffers with valid and quite potentially reuseable contents */ + /* 
remaining buffers */ } else { - bp->b_qindex = QUEUE_LRU; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + switch(bp->b_flags & (B_DELWRI|B_AGE)) { + case B_DELWRI | B_AGE: + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + break; + case B_DELWRI: + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + break; + case B_AGE: + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + break; + default: + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + break; + } } /* - * If B_INVAL, clear B_DELWRI. + * If B_INVAL, clear B_DELWRI. We've already placed the buffer + * on the correct queue. */ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; } runningbufspace -= bp->b_bufsize; /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free. */ if (bp->b_bufsize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. 
*/ void bqrelse(struct buf * bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ panic("bqrelse: multiple refs"); BUF_UNLOCK(bp); splx(s); return; } if (bp->b_flags & B_LOCKED) { bp->b_flags &= ~B_ERROR; bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_DELWRI) { + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else { - bp->b_qindex = QUEUE_LRU; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } runningbufspace -= bp->b_bufsize; if ((bp->b_flags & B_LOCKED) == 0 && - ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) - ) { + ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { bufcountwakeup(); } /* * Something we can maybe wakeup */ if (bp->b_bufsize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } static void vfs_vmio_release(bp) struct buf *bp; { int i, s; vm_page_t m; s = splvm(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has * no valid data. 
*/ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } } bufspace -= bp->b_bufsize; vmiospace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) bufspacewakeup(); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block is currently memory resident. */ struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. */ int vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files, and * then only if our I/O system is not saturated. 
*/ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno, ncl); splx(s); return nwritten; } } BUF_LOCK(bp, LK_EXCLUSIVE); bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp->b_vp, bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) * - * We do *not* attempt to flush dirty buffers more then one level deep. - * I.e., if P_FLSINPROG is set we do not flush dirty buffers at all. - * - * If P_FLSINPROG is set, we are allowed to dip into our emergency - * reserve. + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the pageout daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. 
*/ + + /* + * We fully expect to be able to handle any fragmentation and buffer + * space issues by freeing QUEUE_CLEAN buffers. If this fails, we + * have to wakeup the pageout daemon and ask it to flush some of our + * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock. + * XXX + */ + static struct buf * getnewbuf(struct vnode *vp, daddr_t blkno, int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; struct buf *dbp; int outofspace; int nqindex; int defrag = 0; static int newbufcnt = 0; int lastnewbuf = newbufcnt; + ++getnewbufcalls; + --getnewbufrestarts; restart: + ++getnewbufrestarts; + /* * Calculate whether we are out of buffer space. This state is * recalculated on every restart. If we are out of space, we - * have to turn off defragmentation. The outofspace code will - * defragment too, but the looping conditionals will be messed up - * if both outofspace and defrag are on. + * have to turn off defragmentation. Setting defrag to -1 when + * outofspace is positive means "defrag while freeing buffers". + * The looping conditional will be muffed up if defrag is left + * positive when outofspace is positive. */ dbp = NULL; outofspace = 0; if (bufspace >= hibufspace) { - if ((curproc->p_flag & P_FLSINPROG) == 0 || - bufspace >= maxbufspace - ) { + if ((curproc->p_flag & P_BUFEXHAUST) == 0 || + bufspace >= maxbufspace) { outofspace = 1; - defrag = 0; + if (defrag > 0) + defrag = -1; } } /* * defrag state is semi-persistant. 1 means we are flagged for * defragging. -1 means we actually defragged something. */ /* nop */ /* * Setup for scan. If we do not have enough free buffers, - * we setup a degenerate case that falls through the while. + * we setup a degenerate case that immediately fails. Note + * that if we are specially marked process, we are allowed to + * dip into our reserves. * - * If we are in the middle of a flush, we can dip into the - * emergency reserve. + * Normally we want to find an EMPTYKVA buffer. 
That is, a + * buffer with kva already allocated. If there are no EMPTYKVA + * buffers we back up to the truely EMPTY buffers. When defragging + * we do not bother backing up since we have to locate buffers with + * kva to defrag. If we are out of space we skip both EMPTY and + * EMPTYKVA and dig right into the CLEAN queue. * - * If we are out of space, we skip trying to scan QUEUE_EMPTY - * because those buffers are, well, empty. + * In this manner we avoid scanning unnecessary buffers. It is very + * important for us to do this because the buffer cache is almost + * constantly out of space or in need of defragmentation. */ - if ((curproc->p_flag & P_FLSINPROG) == 0 && + if ((curproc->p_flag & P_BUFEXHAUST) == 0 && numfreebuffers < lofreebuffers) { - nqindex = QUEUE_LRU; + nqindex = QUEUE_CLEAN; nbp = NULL; } else { - nqindex = QUEUE_EMPTY; - if (outofspace || - (nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) == NULL) { - nqindex = QUEUE_AGE; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - if (nbp == NULL) { - nqindex = QUEUE_LRU; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + if (nbp == NULL) { + if (defrag <= 0) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } + if (outofspace || nbp == NULL) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ + if (nbp) + --getnewbufloops; + while ((bp = nbp) != NULL) { int qindex = nqindex; + + ++getnewbufloops; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). 
*/ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: - nqindex = QUEUE_AGE; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) + nqindex = QUEUE_EMPTYKVA; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* fall through */ - case QUEUE_AGE: - nqindex = QUEUE_LRU; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) + case QUEUE_EMPTYKVA: + nqindex = QUEUE_CLEAN; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* fall through */ - case QUEUE_LRU: + case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * Sanity Checks */ KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp)); KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* - * Here we try to move NON VMIO buffers to the end of the - * LRU queue in order to make VMIO buffers more readily - * freeable. We also try to move buffers with a positive - * usecount to the end. - * - * Note that by moving the bp to the end, we setup a following - * loop. Since we continue to decrement b_usecount this - * is ok and, in fact, desireable. - * - * If we are at the end of the list, we move ourself to the - * same place and need to fixup nbp and nqindex to handle - * the following case. + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. */ - if ((qindex == QUEUE_LRU) && bp->b_usecount > 0) { - if ((bp->b_flags & B_VMIO) == 0 || - (vmiospace < maxvmiobufspace) - ) { - --bp->b_usecount; - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - if (nbp == NULL) { - nqindex = qindex; - nbp = bp; - } - continue; - } - } + KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* - * If we come across a delayed write and numdirtybuffers should - * be flushed, try to write it out. Only if P_FLSINPROG is - * not set. 
We can't afford to recursively stack more then - * one deep due to the possibility of having deep VFS call - * stacks. - * - * Limit the number of dirty buffers we are willing to try - * to recover since it really isn't our job here. + * If we are defragging and the buffer isn't useful for fixing + * that problem we continue. If we are out of space and the + * buffer isn't useful for fixing that problem we continue. */ - if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { - /* - * This is rather complex, but necessary. If we come - * across a B_DELWRI buffer we have to flush it in - * order to use it. We only do this if we absolutely - * need to. We must also protect against too much - * recursion which might run us out of stack due to - * deep VFS call stacks. - * - * In heavy-writing situations, QUEUE_LRU can contain - * a large number of DELWRI buffers at its head. These - * buffers must be moved to the tail if they cannot be - * written async in order to reduce the scanning time - * required to skip past these buffers in later - * getnewbuf() calls. - */ - if ((curproc->p_flag & P_FLSINPROG) || - numdirtybuffers < hidirtybuffers) { - if (qindex == QUEUE_LRU) { - /* - * dbp prevents us from looping forever - * if all bps in QUEUE_LRU are dirty. - */ - if (bp == dbp) { - bp = NULL; - break; - } - if (dbp == NULL) - dbp = TAILQ_LAST(&bufqueues[QUEUE_LRU], bqueues); - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - } - continue; - } - curproc->p_flag |= P_FLSINPROG; - vfs_bio_awrite(bp); - curproc->p_flag &= ~P_FLSINPROG; - goto restart; - } - if (defrag > 0 && bp->b_kvasize == 0) + if (defrag > 0 && bp->b_kvasize == 0) { + ++getnewbufloops1; continue; - if (outofspace > 0 && bp->b_bufsize == 0) + } + if (outofspace > 0 && bp->b_bufsize == 0) { + ++getnewbufloops2; continue; + } /* * Start freeing the bp. This is somewhat involved. nbp - * remains valid only for QUEUE_EMPTY bp's. 
+ * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("getnewbuf: locked buf"); bremfree(bp); - if (qindex == QUEUE_LRU || qindex == QUEUE_AGE) { + if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_usecount = 5; LIST_INIT(&bp->b_dep); /* * Ok, now that we have a free buffer, if we are defragging - * we have to recover the kvaspace. + * we have to recover the kvaspace. If we are out of space + * we have to free the buffer (which we just did), but we + * do not have to recover kva space unless we hit a defrag + * hicup. Being able to avoid freeing the kva space leads + * to a significant reduction in overhead. */ if (defrag > 0) { defrag = -1; bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (outofspace > 0) { outofspace = -1; bp->b_flags |= B_INVAL; - bfreekva(bp); + if (defrag < 0) + bfreekva(bp); brelse(bp); goto restart; } /* * We are done */ break; } /* - * If we exhausted our list, sleep as appropriate. + * If we exhausted our list, sleep as appropriate. We may have to + * wakeup the pageout daemon to write out some dirty buffers. 
*/ if (bp == NULL) { int flags; dosleep: if (defrag > 0) flags = VFS_BIO_NEED_KVASPACE; else if (outofspace > 0) flags = VFS_BIO_NEED_BUFSPACE; else flags = VFS_BIO_NEED_ANY; + /* XXX */ + (void) speedup_syncer(); needsbuffer |= flags; while (needsbuffer & flags) { if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo)) return (NULL); } } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. */ vm_offset_t addr = 0; maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; if (maxsize != bp->b_kvasize) { bfreekva(bp); if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr) - ) { + vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. Try * to defragment. */ if (defrag <= 0) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } /* * Uh oh. We couldn't seem to defragment */ bp = NULL; goto dosleep; } } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; } bp->b_data = bp->b_kvabase; } /* * If we have slept at some point in this process and another * process has managed to allocate a new buffer while we slept, * we have to return NULL so that our caller can recheck to * ensure that the other process did not create an identically * identified buffer to the one we were requesting. We make this * check by incrementing the static int newbufcnt each time we * successfully allocate a new buffer. By saving the value of * newbufcnt in our local lastnewbuf, we can compare newbufcnt * with lastnewbuf to see if any other process managed to * allocate a buffer while we were doing so ourselves. * * Note that bp, if valid, is locked. */ if (lastnewbuf == newbufcnt) { /* * No buffers allocated, so we can return one if we were * successful, or continue trying if we were not successful. 
*/ if (bp != NULL) { newbufcnt += 1; return (bp); } goto restart; } /* * Another process allocated a buffer since we were called, so * we have to free the one we allocated and return NULL to let * our caller recheck to see if a new buffer is still needed. */ if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } return (NULL); } /* * waitfreebuffers: * - * Wait for sufficient free buffers. This routine is not called if - * curproc is the update process so we do not have to do anything - * fancy. + * Wait for sufficient free buffers. Only called from normal processes. */ static void waitfreebuffers(int slpflag, int slptimeo) { while (numfreebuffers < hifreebuffers) { - flushdirtybuffers(slpflag, slptimeo); + bd_wakeup(0); if (numfreebuffers >= hifreebuffers) break; needsbuffer |= VFS_BIO_NEED_FREE; if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) break; } } /* - * flushdirtybuffers: - * - * This routine is called when we get too many dirty buffers. + * buf_daemon: * - * We have to protect ourselves from recursion, but we also do not want - * other process's flushdirtybuffers() to interfere with the syncer if - * it decides to flushdirtybuffers(). - * - * In order to maximize operations, we allow any process to flush - * dirty buffers and use P_FLSINPROG to prevent recursion. + * buffer flushing daemon. Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. 
*/ +static struct proc *bufdaemonproc; +static int bd_interval; +static int bd_flushto; + +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) + static void -flushdirtybuffers(int slpflag, int slptimeo) +buf_daemon() { int s; - + /* + * This process is allowed to take the buffer cache to the limit + */ + curproc->p_flag |= P_BUFEXHAUST; s = splbio(); - if (curproc->p_flag & P_FLSINPROG) { - splx(s); - return; - } - curproc->p_flag |= P_FLSINPROG; + bd_interval = 5 * hz; /* dynamically adjusted */ + bd_flushto = hidirtybuffers; /* dynamically adjusted */ - while (numdirtybuffers > lodirtybuffers) { - if (flushbufqueues() == 0) - break; - } + while (TRUE) { + bd_request = 0; - curproc->p_flag &= ~P_FLSINPROG; + /* + * Do the flush. + */ + { + while (numdirtybuffers > bd_flushto) { + if (flushbufqueues() == 0) + break; + } + } - splx(s); + /* + * Whew. If nobody is requesting anything we sleep until the + * next event. If we sleep and the sleep times out and + * nobody is waiting for interesting things we back-off. + * Otherwise we get more aggressive. + */ + + if (bd_request == 0 && + tsleep(&bd_request, PVM, "psleep", bd_interval) && + needsbuffer == 0) { + /* + * timed out and nothing serious going on, + * increase the flushto high water mark to reduce + * the flush rate. + */ + bd_flushto += 10; + } else { + /* + * We were woken up or hit a serious wall that needs + * to be addressed. 
+ */ + bd_flushto -= 10; + if (needsbuffer) { + int middb = (lodirtybuffers+hidirtybuffers)/2; + bd_interval >>= 1; + if (bd_flushto > middb) + bd_flushto = middb; + } + } + if (bd_flushto < lodirtybuffers) { + bd_flushto = lodirtybuffers; + bd_interval -= hz / 10; + } + if (bd_flushto > hidirtybuffers) { + bd_flushto = hidirtybuffers; + bd_interval += hz / 10; + } + if (bd_interval < hz / 10) + bd_interval = hz / 10; + + if (bd_interval > 5 * hz) + bd_interval = 5 * hz; + } } static int flushbufqueues(void) { struct buf *bp; - int qindex; int r = 0; - qindex = QUEUE_AGE; - bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - - for (;;) { - if (bp == NULL) { - if (qindex == QUEUE_LRU) - break; - qindex = QUEUE_LRU; - if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU])) == NULL) - break; - } + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + while (bp) { /* * Try to free up B_INVAL delayed-write buffers rather then * writing them out. Note also that NFS is somewhat sensitive * to B_INVAL buffers so it is doubly important that we do * this. + * + * We do not try to sync buffers whos vnodes are locked, we + * cannot afford to block in this process. */ + KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); if ((bp->b_flags & B_DELWRI) != 0) { if (bp->b_flags & B_INVAL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("flushbufqueues: locked buf"); bremfree(bp); brelse(bp); - } else { + ++r; + break; + } + if (!VOP_ISLOCKED(bp->b_vp)) { vfs_bio_awrite(bp); + ++r; + break; } - ++r; - break; } bp = TAILQ_NEXT(bp, b_freelist); } return(r); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. 
*/ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) return 0; obj = vp->v_object; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) return 0; } return 1; } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. 
*/ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. 
If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and B_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; #if !defined(MAX_PERF) if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); #endif s = splbio(); loop: /* - * Block if we are low on buffers. The syncer is allowed more - * buffers in order to avoid a deadlock. + * Block if we are low on buffers. Certain processes are allowed + * to completely exhaust the buffer cache. */ - if (curproc == updateproc && numfreebuffers == 0) { - needsbuffer |= VFS_BIO_NEED_ANY; - tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", - slptimeo); - } else if (curproc != updateproc && numfreebuffers < lofreebuffers) { + if (curproc->p_flag & P_BUFEXHAUST) { + if (numfreebuffers == 0) { + needsbuffer |= VFS_BIO_NEED_ANY; + tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", + slptimeo); + } + } else if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } if ((bp = gbincore(vp, blkno))) { /* - * Buffer is in-core + * Buffer is in-core. If the buffer is not busy, it must + * be on a queue. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; splx(s); return (struct buf *) NULL; } /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. 
Ohterwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || - (size > bp->b_kvasize) - ) { + (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { VOP_BWRITE(bp->b_vp, bp); goto loop; } if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). 
*/ int bsize, maxsize, vmio; off_t offset; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = (off_t)blkno * bsize; vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(vp, blkno, slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG && vp->v_type != VBLK) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. 
*/ struct buf * geteblk(int size) { struct buf *bp; int s; s = splbio(); while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); #endif if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. 
*/ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) newbsize = mbsize; else #endif newbsize = round_page(size); if (newbsize < bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_data = bp->b_kvabase; bp->b_bufsize = 0; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } #endif vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufspace += mbsize; bufmallocspace += mbsize; runningbufspace += bp->b_bufsize; return 1; } #endif origbuf = NULL; origbufsize = 0; #if !defined(NO_B_MALLOC) /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. 
*/ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = 0; bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } #endif vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); #if !defined(NO_B_MALLOC) if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } #endif } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_busy(m, TRUE, "biodep")) ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. 
We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = vp->v_object; while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; } else { vm_page_wire(m); vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? */ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < - (cnt.v_free_min + cnt.v_cache_min)) - ) { + (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! 
XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (bp->b_flags & B_VMIO) vmiospace += (newbsize - bp->b_bufsize); bufspace += (newbsize - bp->b_bufsize); runningbufspace += (newbsize - bp->b_bufsize); if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } /* * biowait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into a EINTR * error and cleared. */ int biowait(register struct buf * bp) { int s; s = splbio(); - while ((bp->b_flags & B_DONE) == 0) + while ((bp->b_flags & B_DONE) == 0) { #if defined(NO_SCHEDULE_MODS) tsleep(bp, PRIBIO, "biowait", 0); #else if (bp->b_flags & B_READ) tsleep(bp, PRIBIO, "biord", 0); else tsleep(bp, PRIBIO, "biowr", 0); #endif + } splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * biodone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 
* In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void biodone(register struct buf * bp) { int s; s = splbio(); - KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy", bp)); + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; if (bp->b_flags & B_FREEBUF) { brelse(bp); splx(s); return; } if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) (*bioops.io_complete)(bp); if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = vp->v_object; #if defined(VFS_BIO_DEBUG) if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if (vp->v_object == NULL) { panic("biodone: missing VM object"); } if ((vp->v_flag & VOBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); #if !defined(MAX_PERF) if (!obj) { panic("biodone: no object"); } #endif #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. 
*/ iosize = bp->b_bcount; if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) { bp->b_flags |= B_CACHE; } for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); bp->b_flags &= ~B_CACHE; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%lu)/m->pindex(%d) mismatch\n", (unsigned long)foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! 
*/ if (m->busy == 0) { #if !defined(MAX_PERF) printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); #endif if (vp->v_type != VBLK) #if !defined(MAX_PERF) printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", m->valid, m->dirty, m->wire_count); #endif panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff += resid; iosize -= resid; } if (obj) vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) brelse(bp); else bqrelse(bp); } else { wakeup(bp); } splx(s); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); #if !defined(MAX_PERF) if (!m) { panic("vfs_unbusy_pages: page missing\n"); } #endif bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); } } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. 
The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as B_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. 
*/ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i, bogus; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); vfs_setdirty(bp); retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (vm_page_sleep_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. 
*/ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear B_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~(B_INVAL|B_ERROR); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid |= mask; bp->b_resid = 0; return; } ea = sa = bp->b_data; for(i=0;ib_npages;i++,sa=ea) { int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)ulmin((u_long)ea, (u_long)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_pages[i]->valid |= mask; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. 
*/ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { #if !defined(MAX_PERF) if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } #endif bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; 
db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 8357241479e2..efca6c8a1578 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -1,2945 +1,2977 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.204 1999/07/01 13:21:41 peter Exp $ + * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, 
CTLFLAG_RD, &freevnodes, 0, ""); +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); + int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. 
*/ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. 
 */
/*
 * Walk the global mount list (under mountlist_slock) and return the
 * mount whose f_fsid matches 'fsid', or NULL if none does.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid for 'mp'.  val[1] is the filesystem type
 * number; val[0] is a fake device number whose minor incorporates a
 * per-call counter, incremented until the candidate fsid is not
 * already in use.  Serialized by mntid_slock.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = umakedev(255, mtype);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = umakedev(255, mtype + (xxxfs_mntid << 16));
	tfsid.val[1] = mtype;
	/* Only probe for collisions if anything is mounted at all. */
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			xxxfs_mntid++;
			tfsid.val[0] = umakedev(255,
			    mtype + (xxxfs_mntid << 16));
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set all fields of a vattr to VNOVAL ("not specified"), so that
 * filesystems can tell which attributes a caller actually set.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
/*
 * Allocate a vnode: either recycle the least recently used free vnode
 * that has no cached pages and no namecache entries, or zalloc() a
 * fresh one.  The result is returned in *vpp with v_usecount == 1,
 * tagged 'tag', attached to mount 'mp' with operations vector 'vops'.
 * Always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	/*
	 * First migrate everything on the to-be-freed list onto the real
	 * free list (VAGE vnodes go to the head so they recycle sooner).
	 */
	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		/* Below the free-vnode target: prefer allocating a new one. */
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		/* Scan the free list for a recyclable candidate. */
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistant state: RPC: %d, RC: %d\n",
					object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	/* Put the skipped (page-caching) vnodes back on the free list. */
	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		/* Recycle the chosen vnode: purge and reset its state. */
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			/* vgonel consumes the interlock. */
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		/* No recyclable vnode: allocate and initialize a new one. */
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another (or off any queue when
 * mp is NULL).  Serialized by mntvnode_slock.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
/*
 * Called when a write on buffer 'bp' completes: decrement the vnode's
 * outstanding-write count and wake anyone sleeping in VBWAIT for all
 * writes to drain.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * If V_SAVE is set, dirty data is first written out (VOP_FSYNC);
 * otherwise it is discarded.  'slpflag'/'slptimeo' control the sleeps
 * while waiting for I/O and buffer locks.  Returns 0 or a tsleep/fsync
 * error.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		/* Wait for in-flight writes, then push all dirty buffers. */
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	/* Repeatedly drain the clean then dirty lists until both are empty. */
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				/* Buffer busy: sleep for it, then rescan the list. */
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
				(flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/* Let any writes started above finish before declaring success. */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * Buffers at or beyond the new end ('length' rounded up to a block
 * boundary) are invalidated; when length > 0, remaining dirty
 * indirect-block buffers (negative lblkno) are written out.  Always
 * returns 0.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		/* Pass 1: discard clean buffers past the truncation point. */
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				/* The list may have changed while we slept; rescan. */
				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
					 (nbp->b_vp != vp) ||
					 (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		/* Pass 2: discard dirty buffers past the truncation point. */
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
					 (nbp->b_vp != vp) ||
					 (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
		/* Flush delayed-write metadata buffers (negative lblkno). */
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode: hold the vnode, point the buffer at
 * it (and at its device for VBLK/VCHR), and queue the buffer on the
 * vnode's clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= B_VNCLEAN;
	bp->b_xflags &= ~B_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode: remove it from whichever buffer
 * list it is on, take the vnode off the syncer worklist if it has no
 * more dirty buffers, and drop the hold taken by bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
*/ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; + p->p_flag |= P_BUFEXHAUST; + for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. 
A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
*/ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. 
*/ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || - (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); - } else { - if (bp->b_lblkno >= 0) { - struct buf *ttbp; - while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && - (ttbp->b_lblkno < bp->b_lblkno)) { - tbp = ttbp; - } + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise guess. + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; } else { - TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. 
 */
/*
 * Create a VBLK vnode for device 'dev' (returned in *vpp), merging with
 * any existing alias via checkalias().  Returns 0, ENXIO for NODEV, or
 * a getnewvnode() error.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev2udev(dev), (struct mount *)0)) != NULL) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	udev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;
	dev_t dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	dev = udev2dev(nvp_rdev, 2);
	vpp = &speclisth[SPECHASH(dev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * Only alias active device nodes.
		 * Not sure why we don't re-use this like we do below.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			/*
			 * It dissappeared, and we may have slept.
			 * Restart from the beginning
			 */
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	/*
	 * It would be a lot clearer what is going on here if
	 * this had been expressed as:
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * and the clauses had been swapped.
	 */
	if (vp == NULL || vp->v_tag != VT_NON) {
		struct specinfo *sinfo;

		/*
		 * Put the new vnode into the hash chain.
		 * and if there was an alias, connect them.
		 */
		MALLOC(sinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		bzero(sinfo, sizeof(struct specinfo));
		nvp->v_specinfo = sinfo;
		sinfo->si_rdev = dev;
		sinfo->si_hashchain = vpp;
		sinfo->si_specnext = *vpp;
		sinfo->si_bsize_phys = DEV_BSIZE;
		sinfo->si_bsize_best = BLKDEV_IOSIZE;
		sinfo->si_bsize_max = MAXBSIZE;

		/*
		 * Ask the device to fix up specinfo.  Typically the
		 * si_bsize_* parameters may need fixing up.
		 */
		if (nvp->v_type == VBLK) {
			if (bdevsw(dev) && bdevsw(dev)->d_parms)
				(*bdevsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		} else if (nvp->v_type == VCHR) {
			if (devsw(dev) && devsw(dev)->d_parms)
				(*devsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		}
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * We have a vnode alias, but it is a trashed.
	 * Make it look like it's newley allocated. (by getnewvnode())
	 * The caller should use this instead.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	/* First reference pulls the vnode off the free list. */
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Add a reference to a vnode that is already known to be referenced.
 */
void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already-locked vnode: drop the reference and unlock, and
 * deactivate it if this was the last reference.
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled: bump the hold count, and
 * take it off the free list if necessary.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode: drop the hold count and put the
 * vnode back on the free list if it is now reclaimable.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Reclaim (or, with FORCECLOSE, forcibly disassociate) all vnodes on
 * mount 'mp' except 'skipvp'.  Flags: SKIPSYSTEM skips VSYSTEM vnodes,
 * WRITECLOSE only touches writable regular files, FORCECLOSE kills
 * even active vnodes.  Returns 0, or EBUSY if busy vnodes remain.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
/*
 * Disassociate the underlying filesystem from vnode 'vp'.  Called with
 * the interlock held (consumed here via LK_INTERLOCK).  Flushes
 * buffers, tears down the VM object, closes/inactivates active vnodes
 * (DOCLOSE), reclaims filesystem state, and leaves the vnode dead
 * (dead_vnodeop_p / VT_NON).
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		/* Kill every other vnode aliasing the same device. */
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		/* Unreferenced: safe to tear it down via vgonel. */
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	/* Still in use; leave it alone (caller's interlock is kept). */
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 * The interlock is released before return.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 * (vclean consumes the interlock, so re-take it after.)
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * Count remaining aliases: if exactly one other
			 * alias (vx) exists, it is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on success, 0 if no match.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device,
 * summing across all of its aliases.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * (Restart the scan: vgone may sleep and the chain
		 * can change under us.)
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { if (VOP_ISLOCKED(vp)) vprint((char *)0, vp); } simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); } #endif /* * Top level filesystem related information gathering. 
*/ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
*/ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { simple_unlock(&mntvnode_slock); goto again; } nvp = vp->v_mntvnodes.le_next; simple_unlock(&mntvnode_slock); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); return (0); } #endif /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ #if 0 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. 
*/ int vfs_mountedon(vp) struct vnode *vp; { struct vnode *vq; int error = 0; if (vp->v_specmountpoint != NULL) return (EBUSY); if (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specmountpoint != NULL) { error = EBUSY; break; } } simple_unlock(&spechash_slock); } return (error); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp, *nmp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; error = dounmount(mp, MNT_FORCE, p); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. 
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	/* A zero address length means "export to the world" (default). */
	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	/*
	 * One allocation holds the netcred followed by the address and
	 * the (optional) mask, copied in from userland.
	 */
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
/*
 * Radix-tree walker callback: delete and free one export netcred node.
 */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * Update the export information for a mount point: process
 * MNT_DELEXPORT and MNT_EXPORTED/MNT_EXPUBLIC requests from argp.
 */
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Look up the export credentials that apply to a client address for
 * an exported mount point; falls back to the default export, if any.
 * Returns NULL when the client has no access.
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * perform msync on all vnodes under a mount point
 * the mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		/* Vnode was recycled onto another mount: restart the scan. */
		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	/* Retry a bounded number of times while progress is being made. */
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support. This
 * is done for all VREG files in the system. Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (bdevsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode. This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created. This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			/* Object is being torn down; wait and retry. */
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

/*
 * Put an unreferenced vnode on the free list (at head if VAGE is set,
 * at tail otherwise), removing it from the to-be-freed list if needed.
 */
static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Take a vnode off the free (or to-be-freed) list because it is
 * coming back into use.
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode. Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions. (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened. This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested. However, it does
		 * mean that only one event can be noticed at
		 * a time. (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
/* Vnode operations for the syncer pseudo-vnode; most are no-ops. */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	/* Temporarily force synchronous semantics for the sweep. */
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
*/ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8357241479e2..efca6c8a1578 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,2945 +1,2977 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.204 1999/07/01 13:21:41 peter Exp $ + * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, 
CTLFLAG_RD, &freevnodes, 0, ""); +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); + int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. 
*/ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/

/*
 * Allocate and initialize a mount structure for filesystem type
 * fstypename, to be used for mounting the root filesystem.
 *
 * The new mount is returned busied (vfs_busy) and marked read-only,
 * with f_mntonname set to "/" and f_mntfromname copied from devname.
 * Returns ENODEV if the filesystem type is unknown or NULL.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
*/

/*
 * Lookup a mount point by fsid.  Returns the matching mount, or NULL.
 * The mountlist is scanned under mountlist_slock, but the returned
 * mount is not referenced or locked.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.
 *
 * val[0] encodes a pseudo-device number built from major 255 and the
 * filesystem type number plus a per-call counter (xxxfs_mntid) shifted
 * into the high bits; val[1] is the type number.  The candidate is
 * re-probed with vfs_getvfs() until it is unique among mounted
 * filesystems.  Serialized by mntid_slock.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = umakedev(255, mtype);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = umakedev(255, mtype + (xxxfs_mntid << 16));
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			xxxfs_mntid++;
			tfsid.val[0] = umakedev(255,
			    mtype + (xxxfs_mntid << 16));
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL (i.e. "not specified"), so callers of
 * VOP_SETATTR/VOP_GETATTR can detect which fields were filled in.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
*/

/*
 * Allocate a vnode, either by recycling the least-recently-used free
 * vnode (if it has no cached pages and no namecache references) or by
 * allocating a fresh one from the vnode zone.
 *
 * The new vnode is returned in *vpp with v_usecount == 1, type VNON,
 * the given tag/ops, and inserted on mp's vnode list.  Always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */
	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	/*
	 * First migrate everything on the tobefree list onto the real
	 * free list, honoring VAGE (aged vnodes go to the head so they
	 * are recycled first).
	 */
	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		/* Scan the free list for a recyclable candidate. */
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistant state: RPC: %d, RC: %d\n",
					object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	/*
	 * Return the skipped (page-caching) vnodes to the tail of the
	 * free list and release their interlocks.
	 */
	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		/* Recycle: strip the old identity and reset the fields. */
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			/* vgonel consumes the interlock. */
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		/* No candidate: allocate a brand-new vnode. */
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another (mp may be NULL to
 * remove it from any queue).  Serialized by mntvnode_slock.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
*/

/*
 * A write on bp has completed: decrement the vnode's outstanding-I/O
 * count and wake anyone sleeping in VBWAIT for it to drain.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * With V_SAVE, dirty data is first synced to disk (VOP_FSYNC); without
 * it, dirty buffers are simply discarded.  Also removes the pages of
 * the backing VM object.  Returns 0 or a tsleep/fsync error.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		/* Drain pending writes, then push all dirty buffers. */
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
				(flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length. This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * Buffers at or beyond the new last logical block are invalidated;
 * when length > 0, negative-lblkno (indirect-block) delayed writes are
 * flushed as well.  Always returns 0.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					/* Wait for the lock, then rescan. */
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				/* List may have changed underneath us. */
				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode: hold the vnode, record it (and its
 * device) in the buffer, and queue the buffer on the clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= B_VNCLEAN;
	bp->b_xflags &= ~B_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.  If that leaves the vnode with no
 * dirty buffers, also take it off the syncer worklist.  Drops the hold
 * taken in bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syner process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
*/ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; + p->p_flag |= P_BUFEXHAUST; + for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. 
A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
*/ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. 
*/ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || - (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); - } else { - if (bp->b_lblkno >= 0) { - struct buf *ttbp; - while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && - (ttbp->b_lblkno < bp->b_lblkno)) { - tbp = ttbp; - } + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise guess. + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; } else { - TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. 
*/

/*
 * Create a VBLK vnode for device "dev" and return it in *vpp (possibly
 * an existing alias found by checkalias).  Returns ENXIO for NODEV or
 * the getnewvnode() error.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev2udev(dev), (struct mount *)0)) != NULL) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	udev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;
	dev_t	dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	dev = udev2dev(nvp_rdev, 2);
	vpp = &speclisth[SPECHASH(dev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * Only alias active device nodes.
		 * Not sure why we don't re-use this like we do below.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			/*
			 * It dissappeared, and we may have slept.
			 * Restart from the beginning
			 */
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	/*
	 * It would be a lot clearer what is going on here if
	 * this had been expressed as:
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * and the clauses had been swapped.
	 */
	if (vp == NULL || vp->v_tag != VT_NON) {
		struct specinfo *sinfo;

		/*
		 * Put the new vnode into the hash chain.
		 * and if there was an alias, connect them.
		 */
		MALLOC(sinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		bzero(sinfo, sizeof(struct specinfo));
		nvp->v_specinfo = sinfo;
		sinfo->si_rdev = dev;
		sinfo->si_hashchain = vpp;
		sinfo->si_specnext = *vpp;
		sinfo->si_bsize_phys = DEV_BSIZE;
		sinfo->si_bsize_best = BLKDEV_IOSIZE;
		sinfo->si_bsize_max = MAXBSIZE;

		/*
		 * Ask the device to fix up specinfo.  Typically the
		 * si_bsize_* parameters may need fixing up.
		 */
		if (nvp->v_type == VBLK) {
			if (bdevsw(dev) && bdevsw(dev)->d_parms)
				(*bdevsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		} else if (nvp->v_type == VCHR) {
			if (devsw(dev) && devsw(dev)->d_parms)
				(*devsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		}

		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * We have a vnode alias, but it is a trashed.
	 * Make it look like it's newley allocated. (by getnewvnode())
	 * The caller should use this instead.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Bump the reference count of an already-referenced vnode.
 */
void
vref(struct vnode *vp)
{

	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release a reference on a locked vnode: like vrele(), but the caller
 * already holds the vnode lock, which is dropped on the way out.
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
*/
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Reclaim every vnode on mount point mp except skipvp.
 *
 * flags: SKIPSYSTEM skips VSYSTEM vnodes, WRITECLOSE restricts the
 * flush to writable regular files, FORCECLOSE forcibly reclaims busy
 * vnodes (devices are reverted to anonymous aliases instead of being
 * killed).  Returns EBUSY if any busy vnode could not be flushed.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
*/

/*
 * Disassociate the underlying filesystem from vnode vp.
 *
 * Called with the vnode interlock held; drains the vnode lock, flushes
 * buffers, tears down the VM object, runs VOP_INACTIVE/VOP_CLOSE as
 * needed (DOCLOSE), reclaims the vnode and points it at the deadfs ops.
 * Clears VXLOCK and wakes VXWANT sleepers when done.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
/*
 * Returns 1 if the vnode was recycled (and `inter_lkp', if given, was
 * released), 0 if it was still in use.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		/* vgonel() consumes the interlock. */
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 * vclean() consumes the interlock; re-take it afterwards.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			/* Unlink vp from the middle of the hash chain. */
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * Find the remaining alias(es); if exactly one is
			 * left (vq == NULL after finding a single vx), it
			 * is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on a match, 0 otherwise.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 * Unused aliases encountered along the way are flushed out, which
 * forces a rescan of the hash chain from the top.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
/* Indexed by enum vtype; keep in sync with the vtype definition. */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

/*
 * Print a one/two-line description of vnode `vp' to the console,
 * prefixed by `label' (which may be NULL).
 */
void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[96];	/* large enough for all flag names combined */

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);	/* skip leading '|' */
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/* NOTE(review): include target lost in extraction; presumably <ddb/ddb.h> -- confirm */
#include
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		/* vfs_busy dropped mountlist_slock; re-take before unbusy. */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
*/ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
 */
/* ARGSUSED */
/*
 * (Compiled out; see the #if 0 above.)  Copies out, for every vnode in
 * the system, its kernel address followed by the vnode contents.
 */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			/* Drop the list lock around the copyout. */
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
/*
 * Returns EBUSY if `vp' (or any alias of it) has a filesystem mounted
 * on it, 0 otherwise.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		/* Failures are reported but otherwise ignored. */
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
/*
 * Install one export entry from `argp' into the export radix tree of
 * `nep' (or as the default export when ex_addrlen == 0).  Returns 0
 * on success, or EPERM / ENOBUFS / a copyin error on failure; the
 * netcred allocation is freed on every error path.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		/* Default export: no address, at most one per mount. */
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	/* netcred, address and mask live in a single allocation. */
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
/*
 * rnh_walktree callback: delete one export entry from the radix tree
 * `w' and free it.
 */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * Update the export state of mount `mp' per `argp': MNT_DELEXPORT
 * tears down existing exports (including the WebNFS public fs),
 * MNT_EXPORTED installs a new entry.
 */
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Look up the export credentials for client address `nam' on mount
 * `mp'.  Falls back to the default export entry; returns NULL if the
 * client has no access.
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				/* The tree root is not a real entry. */
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * perform msync on all vnodes under a mount point
 * the mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;	/* bounded number of full-list retries */
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		if (vp->v_mount != mp) {
			/* vnode was recycled off this mount; rescan */
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			/* Best-effort pass: skip clean or locked vnodes. */
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (bdevsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			/* Wait for the dying object to go away, then retry. */
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

/*
 * Move vnode `vp' onto the free list (at the head if it is marked
 * VAGE).  Runs at splbio under vnode_free_list_slock.
 */
static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Take vnode `vp' off the free (or to-be-freed) list because it is
 * coming back into use.  Counterpart of vfree().
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
/* Syncer vnode operations: most vops are stubbed out. */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int	sync_fsync __P((struct  vop_fsync_args *));
static int	sync_inactive __P((struct  vop_inactive_args *));
static int	sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 * On success mp->mnt_syncer is set; on failure it is cleared and the
 * getnewvnode() error is returned.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;	/* worklist slot distribution state */
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	/* Temporarily disable async writes so the sync is effective. */
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	/* A lock structure only exists if one was attached; show it. */
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 42f26e437968..e6d23d86d9cc 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -1,514 +1,514 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.73 1999/06/27 11:40:03 peter Exp $ + * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include struct buf; struct mount; struct vnode; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); void (*io_deallocate) __P((struct buf *)); int (*io_fsync) __P((struct vnode *)); int (*io_sync) __P((struct mount *)); } bioops; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. 
Number of bytes remaining in I/O.  After an I/O operation
 *	completes, b_resid is usually 0 indicating 100% success.
 */
struct buf {
	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
	TAILQ_ENTRY(buf) b_vnbufs;	/* Buffer's associated vnode. */
	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
	TAILQ_ENTRY(buf) b_act;		/* Device driver queue when active. *new* */
	long	b_flags;		/* B_* flags. */
	unsigned short b_qindex;	/* buffer queue index */
	unsigned char b_usecount;	/* buffer use count */
	unsigned char b_xflags;		/* extra flags */
	struct lock b_lock;		/* Buffer lock */
	int	b_error;		/* Errno value. */
	long	b_bufsize;		/* Allocated buffer size. */
	long	b_bcount;		/* Valid bytes in buffer. */
	long	b_resid;		/* Remaining I/O. */
	dev_t	b_dev;			/* Device associated with buffer. */
	caddr_t	b_data;			/* Memory, superblocks, indirect etc. */
	caddr_t	b_kvabase;		/* base kva for buffer */
	int	b_kvasize;		/* size of kva for buffer */
	daddr_t	b_lblkno;		/* Logical block number. */
	daddr_t	b_blkno;		/* Underlying physical block number. */
	off_t	b_offset;		/* Offset into file */
					/* Function to call upon completion. */
	void	(*b_iodone) __P((struct buf *));
					/* For nested b_iodone's. */
	struct	iodone_chain *b_iodone_chain;
	struct	vnode *b_vp;		/* Device vnode. */
	int	b_dirtyoff;		/* Offset in buffer of dirty region. */
	int	b_dirtyend;		/* Offset of end of dirty region. */
	struct	ucred *b_rcred;		/* Read credentials reference. */
	struct	ucred *b_wcred;		/* Write credentials reference. */
	daddr_t	b_pblkno;		/* physical block number */
	void	*b_saveaddr;		/* Original b_addr for physio. */
	void	*b_driver1;		/* for private use by the driver */
	void	*b_driver2;		/* for private use by the driver */
	void	*b_caller1;		/* for private use by the driver */
	void	*b_caller2;		/* for private use by the driver */
	union	pager_info {
		void	*pg_spc;
		int	pg_reqpage;
	} b_pager;
	union	cluster_info {
		TAILQ_HEAD(cluster_list_head, buf) cluster_head;
		TAILQ_ENTRY(buf) cluster_entry;
	} b_cluster;
	struct	vm_page *b_pages[btoc(MAXPHYS)];
	int		b_npages;
	struct	workhead b_dep;		/* List of filesystem dependencies. */
	struct chain_info {		/* buffer chaining */
		struct buf *parent;
		int count;
	} b_chain;
};

#define b_spc	b_pager.pg_spc

/*
 * These flags are kept in b_flags.
 *
 * Notes:
 *
 *	B_ASYNC		VOP calls on bp's are usually async whether or not
 *			B_ASYNC is set, but some subsystems, such as NFS, like
 *			to know what is best for the caller so they can
 *			optimize the I/O.
 *
 *	B_PAGING	Indicates that bp is being used by the paging system or
 *			some paging system and that the bp is not linked into
 *			the b_vp's clean/dirty linked lists or ref counts.
 *			Buffer vp reassignments are illegal in this case.
 *
 *	B_CACHE		This may only be set if the buffer is entirely valid.
 *			The situation where B_DELWRI is set and B_CACHE is
 *			clear MUST be committed to disk by getblk() so
 *			B_DELWRI can also be cleared.  See the comments for
 *			getblk() in kern/vfs_bio.c.  If B_CACHE is clear,
 *			the caller is expected to clear B_ERROR|B_INVAL,
 *			set B_READ, and initiate an I/O.
 *
 *			The 'entire buffer' is defined to be the range from
 *			0 through b_bcount.
 *
 *	B_MALLOC	Request that the buffer be allocated from the malloc
 *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
 *
 *	B_VMIO		Indicates that the buffer is tied into an VM object.
 *			The buffer's data is always PAGE_SIZE aligned even
 *			if b_bufsize and b_bcount are not.  ( b_bufsize is
 *			always at least DEV_BSIZE aligned, though ).
 *
 */

#define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
#define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress.
*/ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_UNUSED1 0x00000010 /* Old B_BUSY */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_DIRTY 0x00200000 /* Needs writing later. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANT 0x00800000 /* Used by vm_pager.c */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. 
*/ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_AUTOCHAINDONE 0x80000000 /* Available flag */ #define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define B_VNDIRTY 0x01 /* On vnode dirty list */ #define B_VNCLEAN 0x02 /* On vnode clean list */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ #ifdef KERNEL /* * Buffer locking */ struct simplelock buftimelock; /* Interlock on setting prio and timo */ extern char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curproc */ /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ static __inline int BUF_LOCK __P((struct buf *, int)); static __inline int BUF_LOCK(struct buf *bp, int locktype) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; bp->b_lock.lk_timo = 0; ret = lockmgr(&(bp)->b_lock, locktype, &buftimelock, curproc); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. 
*/ static __inline int BUF_TIMELOCK __P((struct buf *, int, char *, int, int)); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), &buftimelock, curproc); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK __P((struct buf *)); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curproc); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf") /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC __P((struct buf *)); static __inline void BUF_KERNPROC(struct buf *bp) { bp->b_lock.lk_lockholder = LK_KERNPROC; } /* * Find out the number of references to a lock. */ static __inline int BUF_REFCNT __P((struct buf *)); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } #endif /* KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. 
*/ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef KERNEL static __inline void bufq_init __P((struct buf_queue_head *head)); static __inline void bufq_insert_tail __P((struct buf_queue_head *head, struct buf *bp)); static __inline void bufq_remove __P((struct buf_queue_head *head, struct buf *bp)); static __inline struct buf *bufq_first __P((struct buf_queue_head *head)); static __inline void bufq_init(struct buf_queue_head *head) { TAILQ_INIT(&head->queue); head->last_pblkno = 0; head->insert_point = NULL; head->switch_point = NULL; } static __inline void bufq_insert_tail(struct buf_queue_head *head, struct buf *bp) { if ((bp->b_flags & B_ORDERED) != 0) { head->insert_point = bp; head->switch_point = NULL; } TAILQ_INSERT_TAIL(&head->queue, bp, b_act); } static __inline void bufq_remove(struct buf_queue_head *head, struct buf *bp) { if (bp == head->switch_point) head->switch_point = TAILQ_NEXT(bp, b_act); if (bp == head->insert_point) { head->insert_point = TAILQ_PREV(bp, buf_queue, b_act); if (head->insert_point == NULL) head->last_pblkno = 0; } else if (bp == TAILQ_FIRST(&head->queue)) head->last_pblkno = bp->b_pblkno; TAILQ_REMOVE(&head->queue, bp, b_act); if (TAILQ_FIRST(&head->queue) == head->switch_point) head->switch_point = NULL; } static __inline struct buf * bufq_first(struct buf_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } #endif /* KERNEL */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. 
*/ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ -#define QUEUE_LRU 2 /* useful buffers */ -#define QUEUE_VMIO 3 /* VMIO buffers */ -#define QUEUE_AGE 4 /* not-useful buffers */ -#define QUEUE_EMPTY 5 /* empty buffer headers*/ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. 
*/ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); int physread __P((dev_t dev, struct uio *uio, int ioflag)); int physwrite __P((dev_t dev, struct uio *uio, int ioflag)); u_int minphys __P((struct buf *)); void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct 
vnode *, struct buf *)); void pbrelvp __P((struct buf *)); int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 42f26e437968..e6d23d86d9cc 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,514 +1,514 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.73 1999/06/27 11:40:03 peter Exp $ + * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include struct buf; struct mount; struct vnode; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); void (*io_deallocate) __P((struct buf *)); int (*io_fsync) __P((struct vnode *)); int (*io_sync) __P((struct mount *)); } bioops; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. 
For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ caddr_t b_data; /* Memory, superblocks, indirect etc. */ caddr_t b_kvabase; /* base kva for buffer */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ daddr_t b_pblkno; /* physical block number */ void *b_saveaddr; /* Original b_addr for physio. 
*/ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_caller1; /* for private use by the driver */ void *b_caller2; /* for private use by the driver */ union pager_info { void *pg_spc; int pg_reqpage; } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* List of filesystem dependencies. */ struct chain_info { /* buffer chaining */ struct buf *parent; int count; } b_chain; }; #define b_spc b_pager.pg_spc /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear B_ERROR|B_INVAL, * set B_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. 
*/ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_UNUSED1 0x00000010 /* Old B_BUSY */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_DIRTY 0x00200000 /* Needs writing later. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANT 0x00800000 /* Used by vm_pager.c */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. 
*/ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_AUTOCHAINDONE 0x80000000 /* Available flag */ #define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define B_VNDIRTY 0x01 /* On vnode dirty list */ #define B_VNCLEAN 0x02 /* On vnode clean list */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ #ifdef KERNEL /* * Buffer locking */ struct simplelock buftimelock; /* Interlock on setting prio and timo */ extern char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curproc */ /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ static __inline int BUF_LOCK __P((struct buf *, int)); static __inline int BUF_LOCK(struct buf *bp, int locktype) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; bp->b_lock.lk_timo = 0; ret = lockmgr(&(bp)->b_lock, locktype, &buftimelock, curproc); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. 
*/ static __inline int BUF_TIMELOCK __P((struct buf *, int, char *, int, int)); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), &buftimelock, curproc); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK __P((struct buf *)); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curproc); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf") /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC __P((struct buf *)); static __inline void BUF_KERNPROC(struct buf *bp) { bp->b_lock.lk_lockholder = LK_KERNPROC; } /* * Find out the number of references to a lock. */ static __inline int BUF_REFCNT __P((struct buf *)); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } #endif /* KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. 
*/ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef KERNEL static __inline void bufq_init __P((struct buf_queue_head *head)); static __inline void bufq_insert_tail __P((struct buf_queue_head *head, struct buf *bp)); static __inline void bufq_remove __P((struct buf_queue_head *head, struct buf *bp)); static __inline struct buf *bufq_first __P((struct buf_queue_head *head)); static __inline void bufq_init(struct buf_queue_head *head) { TAILQ_INIT(&head->queue); head->last_pblkno = 0; head->insert_point = NULL; head->switch_point = NULL; } static __inline void bufq_insert_tail(struct buf_queue_head *head, struct buf *bp) { if ((bp->b_flags & B_ORDERED) != 0) { head->insert_point = bp; head->switch_point = NULL; } TAILQ_INSERT_TAIL(&head->queue, bp, b_act); } static __inline void bufq_remove(struct buf_queue_head *head, struct buf *bp) { if (bp == head->switch_point) head->switch_point = TAILQ_NEXT(bp, b_act); if (bp == head->insert_point) { head->insert_point = TAILQ_PREV(bp, buf_queue, b_act); if (head->insert_point == NULL) head->last_pblkno = 0; } else if (bp == TAILQ_FIRST(&head->queue)) head->last_pblkno = bp->b_pblkno; TAILQ_REMOVE(&head->queue, bp, b_act); if (TAILQ_FIRST(&head->queue) == head->switch_point) head->switch_point = NULL; } static __inline struct buf * bufq_first(struct buf_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } #endif /* KERNEL */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. 
*/ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ -#define QUEUE_LRU 2 /* useful buffers */ -#define QUEUE_VMIO 3 /* VMIO buffers */ -#define QUEUE_AGE 4 /* not-useful buffers */ -#define QUEUE_EMPTY 5 /* empty buffer headers*/ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. 
*/ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); int physread __P((dev_t dev, struct uio *uio, int ioflag)); int physwrite __P((dev_t dev, struct uio *uio, int ioflag)); u_int minphys __P((struct buf *)); void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct 
vnode *, struct buf *)); void pbrelvp __P((struct buf *)); int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h index e483763917d2..5d41ccfeea9e 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -1,268 +1,269 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernel.h 8.3 (Berkeley) 1/21/94 - * $Id: kernel.h,v 1.55 1999/05/06 13:42:25 peter Exp $ + * $Id: kernel.h,v 1.56 1999/07/01 13:21:43 peter Exp $ */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #include #ifdef KERNEL /* Global variables for the kernel. */ /* 1.1 */ extern long hostid; extern char hostname[MAXHOSTNAMELEN]; extern int hostnamelen; extern char domainname[MAXHOSTNAMELEN]; extern int domainnamelen; extern char kernelname[MAXPATHLEN]; /* 1.2 */ extern struct timeval boottime; extern struct timezone tz; /* XXX */ extern int tick; /* usec per tick (1000000 / hz) */ extern int tickadj; /* "standard" clock skew, us./tick */ extern int hz; /* system clock's frequency */ extern int psratio; /* ratio: prof / stat */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ extern int ticks; extern int lbolt; /* once a second sleep address */ extern int tickdelta; extern long timedelta; #endif /* KERNEL */ /* * Enumerated types for known system startup interfaces. * * Startup occurs in ascending numeric order; the list entries are * sorted prior to attempting startup to guarantee order. 
Items * of the same level are arbitrated for order based on the 'order' * element. * * These numbers are arbitrary and are chosen ONLY for ordering; the * enumeration values are explicit rather than implicit to provide * for binary compatibility with inserted elements. * * The SI_SUB_RUN_SCHEDULER value must have the highest lexical value. * * The SI_SUB_CONSOLE and SI_SUB_SWAP values represent values used by * the BSD 4.4Lite but not by FreeBSD; they are maintained in dependent * order to support porting. * * The SI_SUB_PROTO_BEGIN and SI_SUB_PROTO_END bracket a range of * initializations to take place at splimp(). This is a historical * wart that should be removed -- probably running everything at * splimp() until the first init that doesn't want it is the correct * fix. They are currently present to ensure historical behavior. */ enum sysinit_sub_id { SI_SUB_DUMMY = 0x0000000, /* not executed; for linker*/ SI_SUB_DONE = 0x0000001, /* processed*/ SI_SUB_CONSOLE = 0x0800000, /* console*/ SI_SUB_COPYRIGHT = 0x0800001, /* first use of console*/ SI_SUB_VM = 0x1000000, /* virtual memory system init*/ SI_SUB_KMEM = 0x1800000, /* kernel memory*/ SI_SUB_CPU = 0x2000000, /* CPU resource(s)*/ SI_SUB_KLD = 0x2100000, /* KLD and module setup */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_DEVFS = 0x2300000, /* get DEVFS ready */ SI_SUB_DRIVERS = 0x2400000, /* Let Drivers initialize */ SI_SUB_CONFIGURE = 0x2500000, /* Configure devices */ SI_SUB_RUN_QUEUE = 0x3000000, /* the run queue*/ SI_SUB_VM_CONF = 0x3800000, /* config VM, set limits*/ SI_SUB_VFS = 0x4000000, /* virtual file system*/ SI_SUB_CLOCKS = 0x4800000, /* real time and stat clocks*/ SI_SUB_MBUF = 0x5000000, /* mbufs*/ SI_SUB_CLIST = 0x5800000, /* clists*/ SI_SUB_SYSV_SHM = 0x6400000, /* System V shared memory*/ SI_SUB_SYSV_SEM = 0x6800000, /* System V semaphores*/ SI_SUB_SYSV_MSG = 0x6C00000, /* System V message queues*/ SI_SUB_P1003_1B = 0x6E00000, /* P1003.1B realtime */ SI_SUB_PSEUDO = 0x7000000, /* pseudo 
devices*/ SI_SUB_EXEC = 0x7400000, /* execve() handlers */ SI_SUB_PROTO_BEGIN = 0x8000000, /* XXX: set splimp (kludge)*/ SI_SUB_PROTO_IF = 0x8400000, /* interfaces*/ SI_SUB_PROTO_DOMAIN = 0x8800000, /* domains (address families?)*/ SI_SUB_PROTO_END = 0x8ffffff, /* XXX: set splx (kludge)*/ SI_SUB_KPROF = 0x9000000, /* kernel profiling*/ SI_SUB_KICK_SCHEDULER = 0xa000000, /* start the timeout events*/ SI_SUB_INT_CONFIG_HOOKS = 0xa800000, /* Interrupts enabled config */ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_DUMP_CONF = 0xb200000, /* Find dump devices */ SI_SUB_MOUNT_ROOT = 0xb400000, /* root mount*/ SI_SUB_ROOT_FDTAB = 0xb800000, /* root vnode in fd table...*/ SI_SUB_SWAP = 0xc000000, /* swap*/ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ + SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ SI_SUB_SMP = 0xf000000, /* idle procs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler: no return*/ }; /* * Some enumerated orders; "ANY" sorts last. */ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ }; /* * A system initialization call instance * * At the moment there is one instance of sysinit. We probably do not * want two which is why this code is if'd out, but we definitely want * to discern SYSINIT's which take non-constant data pointers and * SYSINIT's which take constant data pointers, * * The C_* macros take functions expecting const void * arguments * while the non-C_* macros take functions expecting just void * arguments. 
* * With -Wcast-qual on, the compiler issues warnings: * - if we pass non-const data or functions taking non-const data * to a C_* macro. * * - if we pass const data to the normal macros * * However, no warning is issued if we pass a function taking const data * through a normal non-const macro. This is ok because the function is * saying it won't modify the data so we don't care whether the data is * modifiable or not. */ typedef void (*sysinit_nfunc_t) __P((void *)); typedef void (*sysinit_cfunc_t) __P((const void *)); struct sysinit { unsigned int subsystem; /* subsystem identifier*/ unsigned int order; /* init order within subsystem*/ sysinit_cfunc_t func; /* function */ const void *udata; /* multiplexer/argument */ }; /* * Default: no special processing * * The C_ version of SYSINIT is for data pointers to const * data ( and functions taking data pointers to const data ). * At the moment it is no different from SYSINIT and thus * still results in warnings. * * The casts are necessary to have the compiler produce the * correct warnings when -Wcast-qual is used. * */ #define C_SYSINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_init = { \ subsystem, \ order, \ func, \ ident \ }; \ DATA_SET(sysinit_set,uniquifier ## _sys_init); #define SYSINIT(uniquifier, subsystem, order, func, ident) \ C_SYSINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)ident) /* * Called on module unload: no special processing */ #define C_SYSUNINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_uninit = { \ subsystem, \ order, \ func, \ ident \ }; \ DATA_SET(sysuninit_set,uniquifier ## _sys_uninit) #define SYSUNINIT(uniquifier, subsystem, order, func, ident) \ C_SYSUNINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)ident) void sysinit_add __P((struct sysinit **set)); /* * Compatibility. To be deprecated after LKM is removed. 
*/ #include #define PSEUDO_SET(sym, name) \ static int name ## _modevent(module_t mod, int type, void *data) \ { \ void (*initfunc)(void *) = (void (*)(void *))data; \ switch (type) { \ case MOD_LOAD: \ /* printf(#name " module load\n"); */ \ initfunc(NULL); \ break; \ case MOD_UNLOAD: \ printf(#name " module unload - not possible for this module type\n"); \ return EINVAL; \ } \ return 0; \ } \ static moduledata_t name ## _mod = { \ #name, \ name ## _modevent, \ (void *)sym \ }; \ DECLARE_MODULE(name, name ## _mod, SI_SUB_PSEUDO, SI_ORDER_ANY) extern struct linker_set execsw_set; #endif /* !_SYS_KERNEL_H_*/ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 8bc314792b7e..dcecdc99a352 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -1,410 +1,410 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 - * $Id: proc.h,v 1.83 1999/06/30 15:33:41 peter Exp $ + * $Id: proc.h,v 1.84 1999/07/01 13:21:45 peter Exp $ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* Machine-dependent proc substruct. */ #include /* For struct callout_handle. */ #include #include #include /* For struct rtprio. */ #include #ifndef KERNEL #include /* For structs itimerval, timeval. */ #endif #include /* * One structure allocated per session. */ struct session { int s_count; /* Ref cnt; pgrps in session. */ struct proc *s_leader; /* Session leader. */ struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ struct tty *s_ttyp; /* Controlling terminal. */ pid_t s_sid; /* Session ID */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; /* Setlogin() name. */ }; /* * One structure allocated per process group. */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */ LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */ struct session *pg_session; /* Pointer to session. 
*/ struct sigiolst pg_sigiolst; /* List of sigio sources. */ pid_t pg_id; /* Pgrp id. */ int pg_jobc; /* # procs qualifying pgrp for job control */ }; struct procsig { sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ int ps_flag; struct sigacts *ps_sigacts; int ps_refcnt; }; /* * pasleep structure, used by asleep() syscall to hold requested priority * and timeout values for await(). */ struct pasleep { int as_priority; /* Async priority. */ int as_timo; /* Async timeout. */ }; /* * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(PROC ONLY)" below, * which might be addressable only on a processor on which the process * is running. */ struct jail; struct proc { TAILQ_ENTRY(proc) p_procq; /* run/sleep queue. */ LIST_ENTRY(proc) p_list; /* List of all processes. */ /* substructures: */ struct pcred *p_cred; /* Process owner's identity. */ struct filedesc *p_fd; /* Ptr to open files structure. */ struct pstats *p_stats; /* Accounting/statistics (PROC ONLY). */ struct plimit *p_limit; /* Process limits. */ struct vm_object *p_upages_obj;/* Upages object */ struct procsig *p_procsig; #define p_sigacts p_procsig->ps_sigacts #define p_sigignore p_procsig->ps_sigignore #define p_sigcatch p_procsig->ps_sigcatch #define p_ucred p_cred->pc_ucred #define p_rlimit p_limit->pl_rlimit int p_flag; /* P_* flags. */ char p_stat; /* S* process status. */ char p_pad1[3]; pid_t p_pid; /* Process identifier. */ LIST_ENTRY(proc) p_hash; /* Hash chain. */ LIST_ENTRY(proc) p_pglist; /* List of processes in pgrp. */ struct proc *p_pptr; /* Pointer to parent process. 
*/ LIST_ENTRY(proc) p_sibling; /* List of sibling processes. */ LIST_HEAD(, proc) p_children; /* Pointer to list of children. */ struct callout_handle p_ithandle; /* * Callout handle for scheduling * p_realtimer. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* Save parent pid during ptrace. XXX */ int p_dupfd; /* Sideways return value from fdopen. XXX */ struct vmspace *p_vmspace; /* Address space. */ /* scheduling */ u_int p_estcpu; /* Time averaged value of p_cpticks. */ int p_cpticks; /* Ticks of cpu time. */ fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ void *p_wchan; /* Sleep address. */ const char *p_wmesg; /* Reason for sleep. */ u_int p_swtime; /* Time swapped in or out. */ u_int p_slptime; /* Time since last blocked. */ struct itimerval p_realtimer; /* Alarm timer. */ u_int64_t p_runtime; /* Real time in microsec. */ u_quad_t p_uticks; /* Statclock hits in user mode. */ u_quad_t p_sticks; /* Statclock hits in system mode. */ u_quad_t p_iticks; /* Statclock hits processing intr. */ int p_traceflag; /* Kernel trace points. */ struct vnode *p_tracep; /* Trace to vnode. */ int p_siglist; /* Signals arrived but not delivered. */ struct vnode *p_textvp; /* Vnode of executable. */ char p_lock; /* Process lock (prevent swap) count. 
*/ u_char p_oncpu; /* Which cpu we are on */ u_char p_lastcpu; /* Last cpu we were on */ char p_pad2; /* alignment */ short p_locks; /* DEBUG: lockmgr count of held locks */ short p_simple_locks; /* DEBUG: count of held simple locks */ unsigned int p_stops; /* procfs event bitmask */ unsigned int p_stype; /* procfs stop event type */ char p_step; /* procfs stop *once* flag */ unsigned char p_pfsflags; /* procfs flags */ char p_pad3[2]; /* padding for alignment */ register_t p_retval[2]; /* syscall aux returns */ struct sigiolst p_sigiolst; /* list of sigio sources */ int p_sigparent; /* signal to parent on exit */ sigset_t p_oldsigmask; /* saved mask from before sigpause */ int p_sig; /* for core dump/debugger XXX */ u_long p_code; /* for core dump/debugger XXX */ /* End area that is zeroed on creation. */ #define p_endzero p_startcopy /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_sigmask sigset_t p_sigmask; /* Current signal mask. */ u_char p_priority; /* Process priority. */ u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */ char p_nice; /* Process "nice" value. */ char p_comm[MAXCOMLEN+1]; struct pgrp *p_pgrp; /* Pointer to process group. */ struct sysentvec *p_sysent; /* System call dispatch information. */ struct rtprio p_rtprio; /* Realtime priority. */ struct prison *p_prison; /* End area that is copied on creation. */ #define p_endcopy p_addr struct user *p_addr; /* Kernel virtual addr of u-area (PROC ONLY). */ struct mdproc p_md; /* Any machine-dependent fields. */ u_short p_xstat; /* Exit status for wait; also stop signal. */ u_short p_acflag; /* Accounting flags. */ struct rusage *p_ru; /* Exit information. XXX */ int p_nthreads; /* number of threads (only in leader) */ void *p_aioinfo; /* ASYNC I/O info */ int p_wakeup; /* thread id */ struct proc *p_peers; struct proc *p_leader; struct pasleep p_asleep; /* Used by asleep()/await(). 
*/ void *p_emuldata; /* process-specific emulator state data */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id /* Status values. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ /* These flags are kept in p_flags. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_INMEM 0x00004 /* Loaded into memory. */ #define P_NOCLDSTOP 0x00008 /* No SIGCHLD when children stop. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ #define P_SINTR 0x00080 /* Sleep is interruptible. */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_TIMEOUT 0x00400 /* Timing out during sleep. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Debugging process has waited for child. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ /* Should probably be changed into a hold count. */ /* was P_NOSWAP 0x08000 was: Do not swap upages; p->p_hold */ /* was P_PHYSIO 0x10000 was: Doing physical I/O; use p->p_hold */ /* Should be moved to machine-dependent areas. */ #define P_OWEUPC 0x20000 /* Owe process an addupc() call at next ast. */ #define P_SWAPPING 0x40000 /* Process is being swapped. 
*/ #define P_SWAPINREQ 0x80000 /* Swapin request due to wakeup */ /* Marked a kernel thread */ -#define P_FLSINPROG 0x100000 /* dirty buffers flush is in progress */ +#define P_BUFEXHAUST 0x100000 /* dirty buffers flush is in progress */ #define P_KTHREADP 0x200000 /* Process is really a kernel thread */ #define P_NOCLDWAIT 0x400000 /* No zombies if child dies */ #define P_DEADLKTREAT 0x800000 /* lock aquisition - deadlock treatment */ #define P_JAILED 0x1000000 /* Process is in jail */ /* * MOVE TO ucred.h? * * Shareable process credentials (always resident). This includes a reference * to the current user credentials as well as real and saved ids that may be * used to change ids. */ struct pcred { struct ucred *pc_ucred; /* Current credentials. */ uid_t p_ruid; /* Real user id. */ uid_t p_svuid; /* Saved effective user id. */ gid_t p_rgid; /* Real group id. */ gid_t p_svgid; /* Saved effective group id. */ int p_refcnt; /* Number of references. */ }; struct prochd { struct proc *ph_link; /* Linked list of running processes. */ struct proc *ph_rlink; }; #ifdef KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif /* flags for suser_xxx() */ #define PRISON_ROOT 1 /* Handy macro to determine of p1 can mangle p2 */ #define PRISON_CHECK(p1, p2) \ ((!(p1)->p_prison) || (p1)->p_prison == (p2)->p_prison) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". 
*/ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } extern void stopevent(struct proc*, unsigned int, unsigned int); #define STOPEVENT(p,e,v) do { \ if ((p)->p_stops & (e)) stopevent(p,e,v); } while (0) /* hold process U-area in memory, normally for ptrace/procfs work */ #define PHOLD(p) { \ if ((p)->p_lock++ == 0 && ((p)->p_flag & P_INMEM) == 0) \ faultin(p); \ } #define PRELE(p) (--(p)->p_lock) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; #ifndef SET_CURPROC #define SET_CURPROC(p) (curproc = (p)) #endif #ifndef curproc extern struct proc *curproc; /* Current running proc. */ extern int switchticks; /* `ticks' at last context switch. */ extern struct timeval switchtime; /* Uptime at last context switch */ #endif extern struct proc proc0; /* Process slot for swapper. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern int sched_quantum; /* Scheduling quantum in ticks */ LIST_HEAD(proclist, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc, *updateproc; /* Process slots for init, pager. */ #define NQS 32 /* 32 run queues. */ extern struct prochd qs[]; extern struct prochd rtqs[]; extern struct prochd idqs[]; extern int whichqs; /* Bit mask summary of non-empty Q's. */ extern int whichrtqs; /* Bit mask summary of non-empty Q's. */ extern int whichidqs; /* Bit mask summary of non-empty Q's. */ struct proc *pfind __P((pid_t)); /* Find process by id. 
*/ struct pgrp *pgfind __P((pid_t)); /* Find process group by id. */ struct vm_zone; extern struct vm_zone *proc_zone; int chgproccnt __P((uid_t uid, int diff)); int enterpgrp __P((struct proc *p, pid_t pgid, int mksess)); void fixjobc __P((struct proc *p, struct pgrp *pgrp, int entering)); int inferior __P((struct proc *p)); int leavepgrp __P((struct proc *p)); void mi_switch __P((void)); void procinit __P((void)); void resetpriority __P((struct proc *)); int roundrobin_interval __P((void)); void setrunnable __P((struct proc *)); void setrunqueue __P((struct proc *)); void sleepinit __P((void)); int suser __P((struct proc *)); int suser_xxx __P((struct ucred *cred, struct proc *proc, int flag)); void remrq __P((struct proc *)); void cpu_switch __P((struct proc *)); void unsleep __P((struct proc *)); void wakeup_one __P((void *chan)); void cpu_exit __P((struct proc *)) __dead2; void exit1 __P((struct proc *, int)) __dead2; void cpu_fork __P((struct proc *, struct proc *)); void cpu_set_fork_handler __P((struct proc *, void (*)(void *), void *)); int fork1 __P((struct proc *, int, struct proc **)); int trace_req __P((struct proc *)); void cpu_wait __P((struct proc *)); int cpu_coredump __P((struct proc *, struct vnode *, struct ucred *)); void setsugid __P((struct proc *p)); void faultin __P((struct proc *p)); #endif /* KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 414c922d1573..90406fae94c5 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,1461 +1,1459 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.142 1999/06/26 14:56:58 peter Exp $ + * $Id: vm_pageout.c,v 1.143 1999/07/01 13:21:46 peter Exp $ */ /* * The proverbial page-out daemon. 
*/ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t)); static int vm_pageout_scan __P((void)); static int vm_pageout_free_page_calc __P((vm_size_t count)); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon __P((void)); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed=0; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit=0; /* Estimated number of pages deficit */ int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */ extern int npendingio; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif extern int nswiodone; extern int vm_swap_size; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; static int max_page_launder=100; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; #else static int vm_swap_enabled=1; static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 
SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, ""); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); SYSCTL_INT(_vm, OID_AUTO, max_page_launder, CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) typedef void freeer_fcn_t __P((vm_map_t, vm_object_t, vm_pindex_t, int)); static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_pindex_t)); static freeer_fcn_t vm_pageout_object_deactivate_pages; static void vm_req_vmdaemon __P((void)); #endif static void vm_pageout_page_stats(void); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. 
*/ static int vm_pageout_clean(m) vm_page_t m; { register vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int i, forward_okay, backward_okay, page_base; vm_pindex_t pindex = m->pindex; object = m->object; /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ #if 0 /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((object->type == OBJT_DEFAULT) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; #endif /* * Don't mess with the page if it's busy. */ if ((m->hold_count != 0) || ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; #if 0 /* * XXX REMOVED XXX. vm_object_collapse() can block, which can * change the page state. Calling vm_object_collapse() might also * destroy or rename the page because we have not busied it yet!!! * So this code segment is removed. */ /* * Try collapsing before it's too late. XXX huh? Why are we doing * this here? */ if (object->backing_object) { vm_object_collapse(object); } #endif mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; forward_okay = TRUE; if (pindex != 0) backward_okay = TRUE; else backward_okay = FALSE; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. */ for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) { vm_page_t p; /* * See if forward page is clusterable. */ if (forward_okay) { /* * Stop forward scan at end of object. 
*/ if ((pindex + i) > object->size) { forward_okay = FALSE; goto do_backward; } p = vm_page_lookup(object, pindex + i); if (p) { if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { forward_okay = FALSE; goto do_backward; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && (p->queue == PQ_INACTIVE) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count + i] = p; pageout_count++; if (pageout_count == vm_pageout_page_count) break; } else { forward_okay = FALSE; } } else { forward_okay = FALSE; } } do_backward: /* * See if backward page is clusterable. */ if (backward_okay) { /* * Stop backward scan at beginning of object. */ if ((pindex - i) == 0) { backward_okay = FALSE; } p = vm_page_lookup(object, pindex - i); if (p) { if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { backward_okay = FALSE; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && (p->queue == PQ_INACTIVE) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count - i] = p; pageout_count++; page_base--; if (pageout_count == vm_pageout_page_count) break; } else { backward_okay = FALSE; } } else { backward_okay = FALSE; } } } /* * we allow reads during pageouts... */ return vm_pageout_flush(&mc[page_base], pageout_count, 0); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. */ int vm_pageout_flush(mc, count, flags) vm_page_t *mc; int count; int flags; { register vm_object_t object; int pageout_status[count]; int numpagedout = 0; int i; /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... 
we can * allow the pager to do it after the I/O completes. */ for (i = 0; i < count; i++) { vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); } object = mc[0]->object; vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, (flags | ((object == kernel_object) ? OBJPC_SYNC : 0)), pageout_status); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: numpagedout++; break; case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(VM_PAGE_TO_PHYS(mt)); mt->dirty = 0; break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); } } return numpagedout; } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. 
*/ static void vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only) vm_map_t map; vm_object_t object; vm_pindex_t desired; int map_remove_only; { register vm_page_t p, next; int rcount; int remove_mode; int s; if (object->type == OBJT_DEVICE) return; while (object) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) return; if (object->paging_in_progress) return; remove_mode = map_remove_only; if (object->shadow_count > 1) remove_mode = 1; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = TAILQ_FIRST(&object->memq); while (p && (rcount-- > 0)) { int actcount; if (pmap_resident_count(vm_map_pmap(map)) <= desired) return; next = TAILQ_NEXT(p, listq); cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || (p->flags & PG_BUSY) || !pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { p = next; continue; } actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(p)); if (actcount) { vm_page_flag_set(p, PG_REFERENCED); } else if (p->flags & PG_REFERENCED) { actcount = 1; } if ((p->queue != PQ_ACTIVE) && (p->flags & PG_REFERENCED)) { vm_page_activate(p); p->act_count += actcount; vm_page_flag_clear(p, PG_REFERENCED); } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) { vm_page_protect(p, VM_PROT_NONE); vm_page_deactivate(p); } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else { vm_page_activate(p); vm_page_flag_clear(p, PG_REFERENCED); if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else if (p->queue == PQ_INACTIVE) { vm_page_protect(p, VM_PROT_NONE); } p = next; } object = object->backing_object; } return; } /* * 
deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; vm_pindex_t desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) { return; } bigobj = NULL; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if ((obj != NULL) && (obj->shadow_count <= 1) && ((bigobj == NULL) || (bigobj->resident_page_count < obj->resident_page_count))) { bigobj = obj; } } tmpe = tmpe->next; } if (bigobj) vm_pageout_object_deactivate_pages(map, bigobj, desired, 0); /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj) vm_pageout_object_deactivate_pages(map, obj, desired, 0); } tmpe = tmpe->next; }; /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0) pmap_remove(vm_map_pmap(map), VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); vm_map_unlock(map); return; } #endif /* * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects * which we know can be trivially freed. 
*/ void vm_pageout_page_free(vm_page_t m) { vm_object_t object = m->object; int type = object->type; if (type == OBJT_SWAP || type == OBJT_DEFAULT) vm_object_reference(object); vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); if (type == OBJT_SWAP || type == OBJT_DEFAULT) vm_object_deallocate(object); } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static int vm_pageout_scan() { vm_page_t m, next; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; int maxlaunder; int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int force_wakeup = 0; int actcount; int vnodes_skipped = 0; int s; /* * Do whatever cleanup that the pmap code can. */ pmap_collect(); addl_page_shortage_init = vm_pageout_deficit; vm_pageout_deficit = 0; if (max_page_launder == 0) max_page_launder = 1; /* * Calculate the number of pages we want to either free or move * to the cache. */ page_shortage = (cnt.v_free_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_cache_count); page_shortage += addl_page_shortage_init; /* * Figure out what to do with dirty pages when they are encountered. * Assume that 1/3 of the pages on the inactive list are clean. If * we think we can reach our target, disable laundering (do not * clean any dirty pages). If we miss the target we will loop back * up and do a laundering run. */ if (cnt.v_inactive_count / 3 > page_shortage) { maxlaunder = 0; launder_loop = 0; } else { maxlaunder = (cnt.v_inactive_target > max_page_launder) ? max_page_launder : cnt.v_inactive_target; launder_loop = 1; } /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. 
*/ rescan0: addl_page_shortage = addl_page_shortage_init; maxscan = cnt.v_inactive_count; - for ( - m = TAILQ_FIRST(&vm_page_queue_inactive); - m != NULL && maxscan-- > 0 && page_shortage > 0; - m = next - ) { + for (m = TAILQ_FIRST(&vm_page_queue_inactive); + m != NULL && maxscan-- > 0 && page_shortage > 0; + m = next) { cnt.v_pdpages++; if (m->queue != PQ_INACTIVE) { goto rescan0; } next = TAILQ_NEXT(m, pageq); if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); addl_page_shortage++; continue; } /* * Dont mess with busy pages, keep in the front of the * queue, most likely are being paged out. */ if (m->busy || (m->flags & PG_BUSY)) { addl_page_shortage++; continue; } /* * If the object is not being used, we ignore previous * references. */ if (m->object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(VM_PAGE_TO_PHYS(m)); /* * Otherwise, if the page has been referenced while in the * inactive queue, we bump the "activation count" upwards, * making it less likely that the page will be added back to * the inactive queue prematurely again. Here we check the * page tables (or emulated bits, if any), given the upper * level VM system not knowing anything about existing * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) { vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE); continue; } /* * If the upper level VM system knows about any page * references, we activate the page. We also set the * "activation count" higher than normal so that we will less * likely place pages back onto the inactive queue again. 
*/ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE + 1); continue; } /* * If the upper level VM system doesn't know anything about * the page being dirty, we have to check for it again. As * far as the VM code knows, any partially dirty pages are * fully dirty. */ if (m->dirty == 0) { vm_page_test_dirty(m); } else { vm_page_dirty(m); } /* * Invalid pages can be easily freed */ if (m->valid == 0) { vm_pageout_page_free(m); cnt.v_dfree++; --page_shortage; /* * Clean pages can be placed onto the cache queue. */ } else if (m->dirty == 0) { vm_page_cache(m); --page_shortage; /* * Dirty pages need to be paged out. Note that we clean * only a limited number of pages per pagedaemon pass. */ } else if (maxlaunder > 0) { int written; int swap_pageouts_ok; struct vnode *vp = NULL; object = m->object; if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { swap_pageouts_ok = 1; } else { swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min); } /* * We don't bother paging objects that are "dead". * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); continue; } /* * For now we protect against potential memory * deadlocks by requiring significant memory to be * free if the object is not OBJT_DEFAULT or OBJT_SWAP. * We do not 'trust' any other object type to operate * with low memory, not even OBJT_DEVICE. The VM * allocator will special case allocations done by * the pageout daemon so the check below actually * does have some hysteresis in it. It isn't the best * solution, though. 
*/ - if ( - object->type != OBJT_DEFAULT && + if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && - cnt.v_free_count < cnt.v_free_reserved - ) { + cnt.v_free_count < cnt.v_free_reserved) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, + pageq); splx(s); continue; } /* * Presumably we have sufficient free memory to do * the more sophisticated checks and locking required * for vnodes. * * The object is already known NOT to be dead. The * vget() may still block, though, because * VOP_ISLOCKED() doesn't check to see if an inode * (v_data) is associated with the vnode. If it isn't, * vget() will load in it from disk. Worse, vget() * may actually get stuck waiting on "inode" if another * process is in the process of bringing the inode in. * This is bad news for us either way. * * So for the moment we check v_data == NULL as a * workaround. This means that vnodes which do not * use v_data in the way we expect probably will not * wind up being paged out by the pager and it will be * up to the syncer to get them. That's better then * us blocking here. * * This whole code section is bogus - we need to fix * the vnode pager to handle vm_page_t's without us * having to do any sophisticated VOP tests. */ if (object->type == OBJT_VNODE) { vp = object->handle; if (VOP_ISLOCKED(vp) || vp->v_data == NULL || vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); } if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; continue; } /* * The page might have been moved to another queue * during potential blocking in vget() above. 
*/ if (m->queue != PQ_INACTIVE) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); continue; } /* * The page may have been busied during the blocking in * vput(); We don't move the page back onto the end of * the queue so that statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); continue; } /* * If the page has become held, then skip it */ if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ written = vm_pageout_clean(m); if (vp) vput(vp); maxlaunder -= written; } } /* * If we still have a page shortage and we didn't launder anything, * run the inactive scan again and launder something this time. */ if (launder_loop == 0 && page_shortage > 0) { launder_loop = 1; maxlaunder = (cnt.v_inactive_target > max_page_launder) ? max_page_launder : cnt.v_inactive_target; goto rescan0; } /* * Compute the page shortage from the point of view of having to * move pages from the active queue to the inactive queue. */ page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); page_shortage += addl_page_shortage; /* * Scan the active queue for things we can deactivate */ pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queue_active); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { /* * This is a consistancy check, and should likely be a panic * or warning. */ if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. 
*/ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligbility... */ cnt.v_pdpages++; /* * Check to see "how much" the page has been used. */ actcount = 0; if (m->object->ref_count != 0) { if (m->flags & PG_REFERENCED) { actcount += 1; } actcount += pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } } /* * Since we have "tested" this bit, we need to clear it now. */ vm_page_flag_clear(m, PG_REFERENCED); /* * Only if an object is currently being used, do we use the * page activation count stats. */ if (actcount && (m->object->ref_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm_lru || (m->object->ref_count == 0) || (m->act_count == 0)) { page_shortage--; if (m->object->ref_count == 0) { vm_page_protect(m, VM_PROT_NONE); if (m->dirty == 0) vm_page_cache(m); else vm_page_deactivate(m); } else { vm_page_deactivate(m); } } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } } m = next; } s = splvm(); /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. Since both cache and free queues * are considered basically 'free', moving pages from cache to free * does not effect other calculations. 
*/ while (cnt.v_free_count < cnt.v_free_reserved) { static int cache_rover = 0; m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE); if (!m) break; if ((m->flags & PG_BUSY) || m->busy || m->hold_count || m->wire_count) { #ifdef INVARIANTS printf("Warning: busy page %p found in cache\n", m); #endif vm_page_deactivate(m); continue; } cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK; vm_pageout_page_free(m); cnt.v_dfree++; } splx(s); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_pageout_req_swapout |= VM_SWAP_IDLE; vm_req_vmdaemon(); lsec = time_second; } } #endif /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. */ if ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_free_target + cnt.v_cache_min) ) { if (vnodes_skipped && (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) { (void) speedup_syncer(); } #if !defined(NO_SWAPPING) if (vm_swap_enabled && (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) { vm_req_vmdaemon(); vm_pageout_req_swapout |= VM_SWAP_NORMAL; } #endif } /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. */ if ((vm_swap_size == 0 || swap_pager_full) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) { bigproc = NULL; bigsize = 0; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { /* * if this is a system process, skip it */ if ((p->p_flag & P_SYSTEM) || (p->p_lock > 0) || (p->p_pid == 1) || ((p->p_pid < 48) && (vm_swap_size != 0))) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get the process size */ size = vmspace_resident_count(p->p_vmspace); /* * if the this process is bigger than the biggest one * remember it. 
*/ if (size > bigsize) { bigproc = p; bigsize = size; } } if (bigproc != NULL) { killproc(bigproc, "out of swap space"); bigproc->p_estcpu = 0; bigproc->p_nice = PRIO_MIN; resetpriority(bigproc); wakeup(&cnt.v_free_count); } } return force_wakeup; } /* * This routine tries to maintain the pseudo LRU active queue, * so that during long periods of time where there is no paging, * that some statistic accumlation still occurs. This code * helps the situation where paging just starts to occur. */ static void vm_pageout_page_stats() { int s; vm_page_t m,next; int pcount,tpcount; /* Number of pages to check */ static int fullintervalcount = 0; int page_shortage; page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) return; pcount = cnt.v_active_count; fullintervalcount += vm_pageout_stats_interval; if (fullintervalcount < vm_pageout_full_stats_interval) { tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count; if (pcount > tpcount) pcount = tpcount; } m = TAILQ_FIRST(&vm_page_queue_active); while ((m != NULL) && (pcount-- > 0)) { int actcount; if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); m = next; continue; } actcount = 0; if (m->flags & PG_REFERENCED) { vm_page_flag_clear(m, PG_REFERENCED); actcount += 1; } actcount += pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } else { if (m->act_count == 0) { /* * We turn off page access, so that we have more accurate * RSS stats. 
We don't do this in the normal page deactivation * when the system is loaded VM wise, because the cost of * the large number of page protect operations would be higher * than the value of doing the operation. */ vm_page_protect(m, VM_PROT_NONE); vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } } m = next; } } static int vm_pageout_free_page_calc(count) vm_size_t count; { if (count < cnt.v_page_count) return 0; /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE; cnt.v_free_min += cnt.v_free_reserved; return 1; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; vm_pageout_free_page_calc(cnt.v_page_count); /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. 
*/ if (cnt.v_free_count > 6144) cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; else cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; if (cnt.v_free_count > 2048) { cnt.v_cache_min = cnt.v_free_target; cnt.v_cache_max = 2 * cnt.v_cache_min; cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } if (cnt.v_inactive_target > cnt.v_free_count / 3) cnt.v_inactive_target = cnt.v_free_count / 3; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; if (vm_pageout_stats_max == 0) vm_pageout_stats_max = cnt.v_free_target; /* * Set interval in seconds for stats scan. */ if (vm_pageout_stats_interval == 0) vm_pageout_stats_interval = 5; if (vm_pageout_full_stats_interval == 0) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; /* * Set maximum free per pass */ if (vm_pageout_stats_free_max == 0) vm_pageout_stats_free_max = 5; max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16); + curproc->p_flag |= P_BUFEXHAUST; swap_pager_swap_init(); /* * The pageout daemon is never done, so loop forever. 
*/ while (TRUE) { int error; int s = splvm(); if (!vm_pages_needed || ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) { vm_pages_needed = 0; error = tsleep(&vm_pages_needed, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { splx(s); vm_pageout_page_stats(); continue; } } else if (vm_pages_needed) { vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", hz/2); } if (vm_pages_needed) cnt.v_pdwakeups++; vm_pages_needed = 0; splx(s); vm_pageout_scan(); vm_pageout_deficit = 0; wakeup(&cnt.v_free_count); } } void pagedaemon_wakeup() { if (!vm_pages_needed && curproc != pageproc) { vm_pages_needed++; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { struct proc *p; while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(vm_pageout_req_swapout); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM | P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = OFF_TO_IDX( qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) 
*/ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ size = vmspace_resident_count(p->p_vmspace); if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages( &p->p_vmspace->vm_map, limit); } } } } #endif diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 2a0dbc8ce87c..1895d4fedac7 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -1,603 +1,605 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pager.c,v 1.48 1999/06/26 02:46:48 mckusick Exp $ + * $Id: vm_pager.c,v 1.49 1999/06/27 11:44:22 peter Exp $ */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data"); extern struct pagerops defaultpagerops; extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); static void dead_pager_putpages __P((vm_object_t, vm_page_t *, int, int, int *)); static boolean_t dead_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); static void dead_pager_dealloc __P((vm_object_t)); static int dead_pager_getpages(obj, ma, count, req) vm_object_t obj; vm_page_t *ma; int count; int req; { return VM_PAGER_FAIL; } static vm_object_t dead_pager_alloc(handle, size, prot, off) void *handle; vm_ooffset_t size; vm_prot_t prot; vm_ooffset_t off; { return NULL; } static void dead_pager_putpages(object, m, count, flags, rtvals) vm_object_t object; vm_page_t *m; int count; int flags; int *rtvals; { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } static int dead_pager_haspage(object, pindex, prev, next) vm_object_t object; vm_pindex_t pindex; int *prev; int *next; { if (prev) *prev = 0; if (next) *next = 0; return FALSE; } static void dead_pager_dealloc(object) vm_object_t object; { return; } static struct pagerops deadpagerops = { NULL, dead_pager_alloc, dead_pager_dealloc, dead_pager_getpages, dead_pager_putpages, dead_pager_haspage, NULL }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &deadpagerops /* OBJT_DEAD */ }; int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. 
* Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; static int bswneeded; static vm_offset_t swapbkva; /* swap buffers kva */ void vm_pager_init() { struct pagerops **pgops; /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_init != NULL)) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off) { struct pagerops *ops; ops = pagertab[type]; if (ops) return ((*ops->pgo_alloc) (handle, size, prot, off)); return (NULL); } void vm_pager_deallocate(object) vm_object_t object; { (*pagertab[object->type]->pgo_dealloc) (object); } /* * vm_pager_strategy: * * called with no specific spl * Execute strategy routine directly to pager. 
*/ void vm_pager_strategy(vm_object_t object, struct buf *bp) { if (pagertab[object->type]->pgo_strategy) { (*pagertab[object->type]->pgo_strategy)(object, bp); } else { bp->b_flags |= B_ERROR; bp->b_error = ENXIO; biodone(bp); } } /* * vm_pager_get_pages() - inline, see vm/vm_pager.h * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h * vm_pager_page_inserted() - inline, see vm/vm_pager.h * vm_pager_page_removed() - inline, see vm/vm_pager.h */ #if 0 /* * vm_pager_sync: * * Called by pageout daemon before going back to sleep. * Gives pagers a chance to clean up any completed async pageing * operations. */ void vm_pager_sync() { struct pagerops **pgops; for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_sync != NULL)) (*(*pgops)->pgo_sync) (); } #endif vm_offset_t vm_pager_map_page(m) vm_page_t m; { vm_offset_t kva; kva = kmem_alloc_wait(pager_map, PAGE_SIZE); pmap_kenter(kva, VM_PAGE_TO_PHYS(m)); return (kva); } void vm_pager_unmap_page(kva) vm_offset_t kva; { pmap_kremove(kva); kmem_free_wakeup(pager_map, kva, PAGE_SIZE); } vm_object_t vm_pager_object_lookup(pg_list, handle) register struct pagerlst *pg_list; void *handle; { register vm_object_t object; for (object = TAILQ_FIRST(pg_list); object != NULL; object = TAILQ_NEXT(object,pager_object_list)) if (object->handle == handle) return (object); return (NULL); } /* * initialize a physical buffer */ static void initpbuf(struct buf *bp) { bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_data = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva; bp->b_kvabase = bp->b_data; bp->b_kvasize = MAXPHYS; bp->b_xflags = 0; bp->b_flags = 0; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE); } /* * allocate a physical buffer * * There are a limited number (nswbuf) of physical buffers. 
We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * getpbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); +retry: if (pfreecnt) { while (*pfreecnt == 0) { tsleep(pfreecnt, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ while ((bp = TAILQ_FIRST(&bswlist)) == NULL) { bswneeded = 1; tsleep(&bswneeded, PVM, "wswbuf1", 0); + goto retry; /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; splx(s); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; splx(s); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. 
XXX
 */
void
relpbuf(bp, pfreecnt)
	struct buf *bp;
	int *pfreecnt;
{
	int s;

	s = splvm();

	/* Drop any credentials attached by getchainbuf()/users. */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	if (bp->b_vp)
		pbrelvp(bp);

	BUF_UNLOCK(bp);

	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);

	/* Wake a getpbuf() sleeper waiting for the free list... */
	if (bswneeded) {
		bswneeded = 0;
		wakeup(&bswneeded);
	}
	/* ...and one waiting on the subsystem quota going 0 -> 1. */
	if (pfreecnt) {
		if (++*pfreecnt == 1)
			wakeup(pfreecnt);
	}
	splx(s);
}

/********************************************************
 *		CHAINING FUNCTIONS			*
 ********************************************************
 *
 *	These functions support recursion of I/O operations
 *	on bp's, typically by chaining one or more 'child' bp's
 *	to the parent.  Synchronous, asynchronous, and semi-synchronous
 *	chaining is possible.
 */

/*
 *	vm_pager_chain_iodone:
 *
 *	io completion routine for child bp.  Currently we fudge a bit
 *	on dealing with b_resid.  Since users of these routines may issue
 *	multiple children simultaneously, sequencing of the error can be lost.
 */
static void
vm_pager_chain_iodone(struct buf *nbp)
{
	struct buf *bp;

	if ((bp = nbp->b_chain.parent) != NULL) {
		/* Propagate child error (or short I/O, treated as EINVAL). */
		if (nbp->b_flags & B_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = nbp->b_error;
		} else if (nbp->b_resid != 0) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		} else {
			bp->b_resid -= nbp->b_bcount;
		}
		nbp->b_chain.parent = NULL;
		--bp->b_chain.count;
		/* Wake a waitchainbuf() sleeper on the parent. */
		if (bp->b_flags & B_WANT) {
			bp->b_flags &= ~B_WANT;
			wakeup(bp);
		}
		/*
		 * If autochaindone() armed the parent and this was the last
		 * outstanding child, complete the parent here.
		 */
		if (!bp->b_chain.count && (bp->b_flags & B_AUTOCHAINDONE)) {
			bp->b_flags &= ~B_AUTOCHAINDONE;
			if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
				bp->b_flags |= B_ERROR;
				bp->b_error = EINVAL;
			}
			biodone(bp);
		}
	}
	nbp->b_flags |= B_DONE;
	nbp->b_flags &= ~B_ASYNC;
	relpbuf(nbp, NULL);
}

/*
 *	getchainbuf:
 *
 *	Obtain a physical buffer and chain it to its parent buffer.  When
 *	I/O completes, the parent buffer will be B_SIGNAL'd.
Errors are
 *	automatically propagated to the parent
 *
 *	Since these are brand new buffers, we do not have to clear B_INVAL
 *	and B_ERROR because they are already clear.
 */
struct buf *
getchainbuf(struct buf *bp, struct vnode *vp, int flags)
{
	/* May sleep in getpbuf() until a pbuf is free. */
	struct buf *nbp = getpbuf(NULL);

	nbp->b_chain.parent = bp;
	++bp->b_chain.count;

	/* Throttle: never allow more than 4 outstanding children. */
	if (bp->b_chain.count > 4)
		waitchainbuf(bp, 4, 0);

	nbp->b_flags = B_CALL | (bp->b_flags & B_ORDERED) | flags;
	nbp->b_rcred = nbp->b_wcred = proc0.p_ucred;
	nbp->b_iodone = vm_pager_chain_iodone;

	crhold(nbp->b_rcred);
	crhold(nbp->b_wcred);

	if (vp)
		pbgetvp(vp, nbp);
	return(nbp);
}

/*
 * Start I/O on a chained child buffer, or complete it immediately if
 * it carries no data (b_bcount == 0).
 */
void
flushchainbuf(struct buf *nbp)
{
	if (nbp->b_bcount) {
		nbp->b_bufsize = nbp->b_bcount;
		if ((nbp->b_flags & B_READ) == 0)
			nbp->b_dirtyend = nbp->b_bcount;
		VOP_STRATEGY(nbp->b_vp, nbp);
	} else {
		biodone(nbp);
	}
}

/*
 * Sleep until the parent's outstanding-child count drops to 'count'
 * or below.  If 'done' is set, also complete the parent via biodone(),
 * flagging EINVAL for any unexplained residual.
 */
void
waitchainbuf(struct buf *bp, int count, int done)
{
 	int s;

	s = splbio();
	while (bp->b_chain.count > count) {
		/* vm_pager_chain_iodone() clears B_WANT and wakes us. */
		bp->b_flags |= B_WANT;
		tsleep(bp, PRIBIO + 4, "bpchain", 0);
	}
	if (done) {
		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		}
		biodone(bp);
	}
	splx(s);
}

/*
 * Complete the parent now if no children remain; otherwise mark it
 * B_AUTOCHAINDONE so the final vm_pager_chain_iodone() completes it.
 */
void
autochaindone(struct buf *bp)
{
 	int s;

	s = splbio();
	if (bp->b_chain.count == 0)
		biodone(bp);
	else
		bp->b_flags |= B_AUTOCHAINDONE;
	splx(s);
}