Index: head/sys/alpha/alpha/trap.c =================================================================== --- head/sys/alpha/alpha/trap.c (revision 72375) +++ head/sys/alpha/alpha/trap.c (revision 72376) @@ -1,1153 +1,1152 @@ /* $FreeBSD$ */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* #include "opt_fix_unaligned_vax_fp.h" */ #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_simos.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif unsigned long Sfloat_to_reg __P((unsigned int)); unsigned int reg_to_Sfloat __P((unsigned long)); unsigned long Tfloat_reg_cvt __P((unsigned long)); #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg __P((unsigned int)); unsigned int reg_to_Ffloat __P((unsigned long)); unsigned long Gfloat_reg_cvt __P((unsigned long)); #endif int unaligned_fixup __P((unsigned long, unsigned long, unsigned long, struct proc *)); static void printtrap __P((const unsigned long, const unsigned long, const unsigned long, const unsigned long, struct trapframe *, int, int)); #ifdef WITNESS extern char *syscallnames[]; #endif void alpha_clear_resched(void); void alpha_clear_resched(void) { clear_resched(); } /* * Define the code needed before returning to user mode, for * trap and syscall. */ void userret(p, frame, oticks) register struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; /* take pending signals */ while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, a clock interrupt could * change our priority without changing run queues * (the running process is not kept on a run queue). * If this happened after we setrunqueue ourselves but * before we switch()'ed, we might not be on the queue * indicated by our priority. 
*/ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * If profiling, charge recent system time to the trapped pc. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, frame->tf_regs[FRAME_PC], (int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } static void printtrap(a0, a1, a2, entry, framep, isfatal, user) const unsigned long a0, a1, a2, entry; struct trapframe *framep; int isfatal, user; { char ubuf[64]; const char *entryname; switch (entry) { case ALPHA_KENTRY_INT: entryname = "interrupt"; break; case ALPHA_KENTRY_ARITH: entryname = "arithmetic trap"; break; case ALPHA_KENTRY_MM: entryname = "memory management fault"; break; case ALPHA_KENTRY_IF: entryname = "instruction fault"; break; case ALPHA_KENTRY_UNA: entryname = "unaligned access fault"; break; case ALPHA_KENTRY_SYS: entryname = "system call"; break; default: snprintf(ubuf, sizeof(ubuf), "type %lx", entry); entryname = (const char *) ubuf; break; } printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap entry = 0x%lx (%s)\n", entry, entryname); printf(" a0 = 0x%lx\n", a0); printf(" a1 = 0x%lx\n", a1); printf(" a2 = 0x%lx\n", a2); printf(" pc = 0x%lx\n", framep->tf_regs[FRAME_PC]); printf(" ra = 0x%lx\n", framep->tf_regs[FRAME_RA]); printf(" curproc = %p\n", curproc); if (curproc != NULL) printf(" pid = %d, comm = %s\n", curproc->p_pid, curproc->p_comm); printf("\n"); } /* * Trap is called from locore to handle most types of processor traps. 
* System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * Alpha architecture. */ /*ARGSUSED*/ void trap(a0, a1, a2, entry, framep) const unsigned long a0, a1, a2, entry; struct trapframe *framep; { register struct proc *p; register int i; u_int64_t ucode; u_quad_t sticks; int user; /* * Find our per-cpu globals. */ globalp = (struct globaldata *) alpha_pal_rdval(); cnt.v_trap++; p = curproc; ucode = 0; user = (framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) != 0; if (user) { mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; #if 0 /* This is to catch some weird stuff on the UDB (mj) */ if (framep->tf_regs[FRAME_PC] > 0 && framep->tf_regs[FRAME_PC] < 0x120000000) { printf("PC Out of Whack\n"); printtrap(a0, a1, a2, entry, framep, 1, user); } #endif } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ } #ifdef DIAGNOSTIC if (user) alpha_fpstate_check(p); #endif switch (entry) { case ALPHA_KENTRY_UNA: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { mtx_lock(&Giant); if ((i = unaligned_fixup(a0, a1, a2, p)) == 0) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = a0; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case ALPHA_KENTRY_ARITH: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. 
*/ if (user) { mtx_lock(&Giant); if (a0 & EXCSUM_SWC) if (fp_software_completion(a1, p)) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); i = SIGFPE; ucode = a0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case ALPHA_KENTRY_IF: /* * These are always fatal in kernel, and should never * happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured; BUGCHK * is used to invoke the kernel debugger, and we * might have set a breakpoint. */ if (a0 == ALPHA_IF_CODE_BUGCHK || a0 == ALPHA_IF_CODE_BPT #ifdef SIMOS || a0 == ALPHA_IF_CODE_GENTRAP #endif ) { if (kdb_trap(a0, a1, a2, entry, framep)) goto out; } /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! */ #endif goto dopanic; } i = 0; switch (a0) { case ALPHA_IF_CODE_GENTRAP: if (framep->tf_regs[FRAME_A0] == -2) { /* weird! */ i = SIGFPE; ucode = a0; /* exception summary */ break; } /* FALLTHROUTH */ case ALPHA_IF_CODE_BPT: case ALPHA_IF_CODE_BUGCHK: if (p->p_md.md_flags & (MDP_STEP1|MDP_STEP2)) { ptrace_clear_single_step(p); p->p_md.md_tf->tf_regs[FRAME_PC] -= 4; } ucode = a0; /* trap type */ i = SIGTRAP; break; case ALPHA_IF_CODE_OPDEC: ucode = a0; /* trap type */ i = SIGILL; break; case ALPHA_IF_CODE_FEN: /* * on exit from the kernel, if proc == fpcurproc, * FP is enabled. */ if (PCPU_GET(fpcurproc) == p) { printf("trap: fp disabled for fpcurproc == %p", p); goto dopanic; } alpha_fpstate_switch(p); goto out; default: printf("trap: unknown IF type 0x%lx\n", a0); goto dopanic; } break; case ALPHA_KENTRY_MM: switch (a1) { case ALPHA_MMCSR_FOR: case ALPHA_MMCSR_FOE: pmap_emulate_reference(p, a0, user, 0); goto out; case ALPHA_MMCSR_FOW: pmap_emulate_reference(p, a0, user, 1); goto out; case ALPHA_MMCSR_INVALTRANS: case ALPHA_MMCSR_ACCESS: { register vm_offset_t va; register struct vmspace *vm = NULL; register vm_map_t map; vm_prot_t ftype = 0; int rv; /* * If it was caused by fuswintr or suswintr, * just punt. 
Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && p != NULL && p->p_addr->u_pcb.pcb_onfault == (unsigned long)fswintrberr && p->p_addr->u_pcb.pcb_accessaddr == a0) { framep->tf_regs[FRAME_PC] = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; goto out; } mtx_lock(&Giant); /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. */ if (!user && ((a0 >= VM_MIN_KERNEL_ADDRESS) || (p == NULL) || (p->p_addr->u_pcb.pcb_onfault == 0))) { if (a0 >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && a0 < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } switch (a2) { case -1: /* instruction fetch fault */ case 0: /* load instruction */ ftype = VM_PROT_READ; break; case 1: /* store instruction */ ftype = VM_PROT_WRITE; break; #ifdef DIAGNOSTIC default: /* XXX gcc -Wuninitialized */ goto dopanic; #endif } va = trunc_page((vm_offset_t)a0); if (map != kernel_map) { /* * Keep swapout from messing with us * during thiscritical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } nogo:; /* * If this was a stack access we keep track of the * maximum accessed stack size. Also, if vm_fault * gets a protection failure it is due to accessing * the stack region outside the current limit and * we need to reflect that as an access error. */ if (map != kernel_map && (caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (rv == KERN_SUCCESS) { unsigned nss; nss = alpha_btop(round_page(USRSTACK - va)); if (nss > vm->vm_ssize) vm->vm_ssize = nss; } else if (rv == KERN_PROTECTION_FAILURE) rv = KERN_INVALID_ADDRESS; } if (rv == KERN_SUCCESS) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); if (!user) { /* Check for copyin/copyout fault */ if (p != NULL && p->p_addr->u_pcb.pcb_onfault != 0) { framep->tf_regs[FRAME_PC] = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; goto out; } goto dopanic; } ucode = a0; i = SIGSEGV; #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif break; } default: printf("trap: unknown MMCSR value 0x%lx\n", a1); goto dopanic; } break; default: goto dopanic; } #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif framep->tf_regs[FRAME_TRAPARG_A0] = a0; framep->tf_regs[FRAME_TRAPARG_A1] = a1; framep->tf_regs[FRAME_TRAPARG_A2] = a2; trapsignal(p, i, ucode); out: if (user) { framep->tf_regs[FRAME_SP] = alpha_pal_rdusp(); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } return; dopanic: printtrap(a0, a1, a2, entry, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(a0, a1, a2, entry, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in v0, and the arguments in the registers (as normal). 
They return * an error flag in a3 (if a3 != 0 on return, the syscall had an error), * and the return value (if any) in v0. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into their places * in the trap frame. On return, it restores the callee-saved registers, * a3, and v0 from the frame before returning to the user process. */ void syscall(code, framep) u_int64_t code; struct trapframe *framep; { struct sysent *callp; struct proc *p; int error = 0; u_int64_t opc; u_quad_t sticks; u_int64_t args[10]; /* XXX */ u_int hidden = 0, nargs; /* * Find our per-cpu globals. */ globalp = (struct globaldata *) alpha_pal_rdval(); mtx_lock(&Giant); framep->tf_regs[FRAME_TRAPARG_A0] = 0; framep->tf_regs[FRAME_TRAPARG_A1] = 0; framep->tf_regs[FRAME_TRAPARG_A2] = 0; #if notdef /* can't happen, ever. */ if ((framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) == 0) panic("syscall"); #endif cnt.v_syscall++; p = curproc; p->p_md.md_tf = framep; opc = framep->tf_regs[FRAME_PC] - 4; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); #ifdef DIAGNOSTIC alpha_fpstate_check(p); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the alpha, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. 
*/ code = framep->tf_regs[FRAME_A0]; hidden = 1; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; nargs = (callp->sy_narg & SYF_ARGMASK) + hidden; switch (nargs) { default: if (nargs > 10) /* XXX */ panic("syscall: too many args (%d)", nargs); error = copyin((caddr_t)(alpha_pal_rdusp()), &args[6], (nargs - 6) * sizeof(u_int64_t)); case 6: args[5] = framep->tf_regs[FRAME_A5]; case 5: args[4] = framep->tf_regs[FRAME_A4]; case 4: args[3] = framep->tf_regs[FRAME_A3]; case 3: args[2] = framep->tf_regs[FRAME_A2]; case 2: args[1] = framep->tf_regs[FRAME_A1]; case 1: args[0] = framep->tf_regs[FRAME_A0]; case 0: break; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, (callp->sy_narg & SYF_ARGMASK), args + hidden); #endif if (error == 0) { p->p_retval[0] = 0; p->p_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(p, args + hidden); } switch (error) { case 0: framep->tf_regs[FRAME_V0] = p->p_retval[0]; framep->tf_regs[FRAME_A4] = p->p_retval[1]; framep->tf_regs[FRAME_A3] = 0; break; case ERESTART: framep->tf_regs[FRAME_PC] = opc; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_regs[FRAME_V0] = error; framep->tf_regs[FRAME_A3] = 1; break; } userret(p, framep, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Process an asynchronous software trap. * This is relatively easy. */ void ast(framep) struct trapframe *framep; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; p->p_md.md_tf = framep; astoff(); cnt.v_soft++; mtx_intr_enable(&sched_lock); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } /* * Unaligned access handler. It's not clear that this can get much slower... * */ const static int reg_to_framereg[32] = { FRAME_V0, FRAME_T0, FRAME_T1, FRAME_T2, FRAME_T3, FRAME_T4, FRAME_T5, FRAME_T6, FRAME_T7, FRAME_S0, FRAME_S1, FRAME_S2, FRAME_S3, FRAME_S4, FRAME_S5, FRAME_S6, FRAME_A0, FRAME_A1, FRAME_A2, FRAME_A3, FRAME_A4, FRAME_A5, FRAME_T8, FRAME_T9, FRAME_T10, FRAME_T11, FRAME_RA, FRAME_T12, FRAME_AT, FRAME_GP, FRAME_SP, -1, }; #define irp(p, reg) \ ((reg_to_framereg[(reg)] == -1) ? 
NULL : \ &(p)->p_md.md_tf->tf_regs[reg_to_framereg[(reg)]]) #define frp(p, reg) \ (&(p)->p_addr->u_pcb.pcb_fp.fpr_regs[(reg)]) #define unaligned_load(storage, ptrf, mod) \ if (copyin((caddr_t)va, &(storage), sizeof (storage)) == 0 && \ (regptr = ptrf(p, reg)) != NULL) \ signal = 0; \ else \ break; \ *regptr = mod (storage); #define unaligned_store(storage, ptrf, mod) \ if ((regptr = ptrf(p, reg)) == NULL) \ break; \ (storage) = mod (*regptr); \ if (copyout(&(storage), (caddr_t)va, sizeof (storage)) == 0) \ signal = 0; \ else \ break; #define unaligned_load_integer(storage) \ unaligned_load(storage, irp, ) #define unaligned_store_integer(storage) \ unaligned_store(storage, irp, ) #define unaligned_load_floating(storage, mod) \ alpha_fpstate_save(p, 1); \ unaligned_load(storage, frp, mod) #define unaligned_store_floating(storage, mod) \ alpha_fpstate_save(p, 0); \ unaligned_store(storage, frp, mod) unsigned long Sfloat_to_reg(s) unsigned int s; { unsigned long sign, expn, frac; unsigned long result; sign = (s & 0x80000000) >> 31; expn = (s & 0x7f800000) >> 23; frac = (s & 0x007fffff) >> 0; /* map exponent part, as appropriate. */ if (expn == 0xff) expn = 0x7ff; else if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frac << 29); return (result); } unsigned int reg_to_Sfloat(r) unsigned long r; { unsigned long sign, expn, frac; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frac = (r & 0x000fffffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 31) | (expn << 23) | (frac << 0); return (result); } /* * Conversion of T floating datums to and from register format * requires no bit reordering whatsoever. 
*/ unsigned long Tfloat_reg_cvt(input) unsigned long input; { return (input); } #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(f) unsigned int f; { unsigned long sign, expn, frlo, frhi; unsigned long result; sign = (f & 0x00008000) >> 15; expn = (f & 0x00007f80) >> 7; frhi = (f & 0x0000007f) >> 0; frlo = (f & 0xffff0000) >> 16; /* map exponent part, as appropriate. */ if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frhi << 45) | (frlo << 29); return (result); } unsigned int reg_to_Ffloat(r) unsigned long r; { unsigned long sign, expn, frhi, frlo; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frhi = (r & 0x000fe00000000000) >> 45; frlo = (r & 0x00001fffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 15) | (expn << 7) | (frhi << 0) | (frlo << 16); return (result); } /* * Conversion of G floating datums to and from register format is * symmetrical. Just swap shorts in the quad... 
*/ unsigned long Gfloat_reg_cvt(input) unsigned long input; { unsigned long a, b, c, d; unsigned long result; a = (input & 0x000000000000ffff) >> 0; b = (input & 0x00000000ffff0000) >> 16; c = (input & 0x0000ffff00000000) >> 32; d = (input & 0xffff000000000000) >> 48; result = (a << 48) | (b << 32) | (c << 16) | (d << 0); return (result); } #endif /* FIX_UNALIGNED_VAX_FP */ extern int alpha_unaligned_print, alpha_unaligned_fix; extern int alpha_unaligned_sigbus; int unaligned_fixup(va, opcode, reg, p) unsigned long va, opcode, reg; struct proc *p; { int doprint, dofix, dosigbus; int signal, size; const char *type; unsigned long *regptr, longdata, uac; int intdata; /* signed to get extension when storing */ struct { const char *type; /* opcode name */ int size; /* size, 0 if fixup not supported */ } tab[0x10] = { #ifdef FIX_UNALIGNED_VAX_FP { "ldf", 4 }, { "ldg", 8 }, #else { "ldf", 0 }, { "ldg", 0 }, #endif { "lds", 4 }, { "ldt", 8 }, #ifdef FIX_UNALIGNED_VAX_FP { "stf", 4 }, { "stg", 8 }, #else { "stf", 0 }, { "stg", 0 }, #endif { "sts", 4 }, { "stt", 8 }, { "ldl", 4 }, { "ldq", 8 }, { "ldl_l", 0 }, { "ldq_l", 0 }, /* can't fix */ { "stl", 4 }, { "stq", 8 }, { "stl_c", 0 }, { "stq_c", 0 }, /* can't fix */ }; /* * Figure out what actions to take. * */ if (p) uac = p->p_md.md_flags & MDP_UAC_MASK; else uac = 0; doprint = alpha_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = alpha_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = alpha_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * Find out which opcode it is. Arrange to have the opcode * printed if it's an unknown opcode. */ if (opcode >= 0x20 && opcode <= 0x2f) { type = tab[opcode - 0x20].type; size = tab[opcode - 0x20].size; } else { type = "0x%lx"; size = 0; } /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? 
size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. */ if (doprint) { uprintf( "pid %d (%s): unaligned access: va=0x%lx pc=0x%lx ra=0x%lx op=", p->p_pid, p->p_comm, va, p->p_md.md_tf->tf_regs[FRAME_PC], p->p_md.md_tf->tf_regs[FRAME_RA]); uprintf(type,opcode); uprintf("\n"); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. */ signal = SIGBUS; if (dofix && size != 0) { switch (opcode) { #ifdef FIX_UNALIGNED_VAX_FP case 0x20: /* ldf */ unaligned_load_floating(intdata, Ffloat_to_reg); break; case 0x21: /* ldg */ unaligned_load_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x22: /* lds */ unaligned_load_floating(intdata, Sfloat_to_reg); break; case 0x23: /* ldt */ unaligned_load_floating(longdata, Tfloat_reg_cvt); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x24: /* stf */ unaligned_store_floating(intdata, reg_to_Ffloat); break; case 0x25: /* stg */ unaligned_store_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x26: /* sts */ unaligned_store_floating(intdata, reg_to_Sfloat); break; case 0x27: /* stt */ unaligned_store_floating(longdata, Tfloat_reg_cvt); break; case 0x28: /* ldl */ unaligned_load_integer(intdata); break; case 0x29: /* ldq */ unaligned_load_integer(longdata); break; case 0x2c: /* stl */ unaligned_store_integer(intdata); break; case 0x2d: /* stq */ unaligned_store_integer(longdata); break; #ifdef DIAGNOSTIC default: panic("unaligned_fixup: can't get here"); #endif } } /* * Force SIGBUS if requested. 
*/ if (dosigbus) signal = SIGBUS; out: return (signal); } Index: head/sys/amd64/amd64/cpu_switch.S =================================================================== --- head/sys/amd64/amd64/cpu_switch.S (revision 72375) +++ head/sys/amd64/amd64/cpu_switch.S (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/amd64/amd64/genassym.c =================================================================== --- head/sys/amd64/amd64/genassym.c (revision 72375) +++ head/sys/amd64/amd64/genassym.c (revision 72376) @@ -1,222 +1,220 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ #include "opt_user_ldt.h" #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); -#ifdef SMP ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu)); ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu)); -#endif ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); 
ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); #ifdef USER_LDT ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); #endif ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, 
offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); #endif -#ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); +#ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); 
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVEINTR, offsetof(struct mtx, mtx_saveintr)); Index: head/sys/amd64/amd64/swtch.s =================================================================== --- head/sys/amd64/amd64/swtch.s (revision 72375) +++ head/sys/amd64/amd64/swtch.s (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 72375) +++ head/sys/amd64/amd64/trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace 
trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. 
*/ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. 
*/ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. 
*/
void
dblfault_handler()
{
	/*
	 * Dump the register state saved in the double-fault TSS and
	 * panic; there is no recovery once the CPU could not even
	 * push a trap frame.
	 */
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 *
 * Returns 0 if the write fault at 'addr' was resolved, 1 on failure
 * (address out of range, stack growth failed, or vm_fault() error).
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	/* Bump p_lock to keep swapout from messing with us (cf. trap_pfault()). */
	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	/* Grow the stack if the fault falls in a growable stack region. */
	if (!grow_stack (p, va)) {
		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
		return (1);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * syscall -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g. a task switch) prior to return.
 *
 * In general, only simple access and manipulation of curproc and
 * the current stack is allowed without having to hold MP lock.
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/dev/acpica/Osd/OsdSchedule.c =================================================================== --- head/sys/dev/acpica/Osd/OsdSchedule.c (revision 72375) +++ head/sys/dev/acpica/Osd/OsdSchedule.c (revision 72376) @@ -1,150 +1,150 @@ /*- * Copyright (c) 2000 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * 6.3 : Scheduling services */ #include "acpi.h" #include #include #include #include #define _COMPONENT OS_DEPENDENT MODULE_NAME("SCHEDULE") /* * This is a little complicated due to the fact that we need to build and then * free a 'struct task' for each task we enqueue. * * We use the default taskqueue_swi queue, since it really doesn't matter what * else we're queued along with. 
*/

MALLOC_DEFINE(M_ACPITASK, "acpitask", "ACPI deferred task");

static void	AcpiOsExecuteQueue(void *arg, int pending);

/*
 * Wrapper that embeds a taskqueue(9) 'struct task' (it must come
 * first so the whole object can be cast to struct task *) together
 * with the ACPI callback and its argument.
 */
struct acpi_task {
    struct task			at_task;	/* must be first member */
    OSD_EXECUTION_CALLBACK	at_function;	/* ACPI callback to run */
    void			*at_context;	/* argument for callback */
};

/*
 * Queue (Function, Context) for deferred execution on the SWI
 * taskqueue.  Allocates one acpi_task per request with M_NOWAIT
 * (may be called from interrupt context); the wrapper is freed by
 * AcpiOsExecuteQueue() when the task runs.
 *
 * Returns AE_OK, AE_BAD_PARAMETER (NULL Function or unknown
 * Priority) or AE_NO_MEMORY.
 */
ACPI_STATUS
AcpiOsQueueForExecution(UINT32 Priority, OSD_EXECUTION_CALLBACK Function, void *Context)
{
    struct acpi_task	*at;

    FUNCTION_TRACE(__FUNCTION__);

    if (Function == NULL)
	return_ACPI_STATUS(AE_BAD_PARAMETER);

    at = malloc(sizeof(*at), M_ACPITASK, M_NOWAIT);	/* Interrupt Context */
    if (at == NULL)
	return_ACPI_STATUS(AE_NO_MEMORY);
    bzero(at, sizeof(*at));

    at->at_function = Function;
    at->at_context = Context;
    at->at_task.ta_func = AcpiOsExecuteQueue;
    at->at_task.ta_context = at;

    /* Map the ACPI priority classes onto taskqueue priorities 4..1. */
    switch (Priority) {
    case OSD_PRIORITY_GPE:
	at->at_task.ta_priority = 4;
	break;
    case OSD_PRIORITY_HIGH:
	at->at_task.ta_priority = 3;
	break;
    case OSD_PRIORITY_MED:
	at->at_task.ta_priority = 2;
	break;
    case OSD_PRIORITY_LO:
	at->at_task.ta_priority = 1;
	break;
    default:
	free(at, M_ACPITASK);
	return_ACPI_STATUS(AE_BAD_PARAMETER);
    }

    taskqueue_enqueue(taskqueue_swi, (struct task *)at);
    return_ACPI_STATUS(AE_OK);
}

/*
 * Taskqueue trampoline: unpack the acpi_task, free the wrapper
 * (only the saved function/context are needed past this point),
 * then invoke the ACPI callback.
 */
static void
AcpiOsExecuteQueue(void *arg, int pending)
{
    struct acpi_task		*at = (struct acpi_task *)arg;
    OSD_EXECUTION_CALLBACK	Function;
    void			*Context;

    FUNCTION_TRACE(__FUNCTION__);

    Function = (OSD_EXECUTION_CALLBACK)at->at_function;
    Context = at->at_context;

    free(at, M_ACPITASK);

    Function(Context);

    return_VOID;
}

/*
 * We don't have any sleep granularity better than hz, so
 * make do with that.
*/
void
AcpiOsSleep (UINT32 Seconds, UINT32 Milliseconds)
{
    int		timo;

    FUNCTION_TRACE(__FUNCTION__);

    /*
     * NOTE(review): the millisecond term looks wrong -- dividing by
     * (1000 * hz) shrinks as hz grows, so sub-second sleeps nearly
     * always collapse to the 1-tick minimum below.  Converting ms to
     * ticks is conventionally Milliseconds * hz / 1000; confirm
     * against tsleep(9) before relying on this timing.
     */
    timo = (Seconds * hz) + Milliseconds / (1000 * hz);
    /* Never pass 0: tsleep(timo == 0) would sleep forever. */
    if (timo == 0)
	timo = 1;
-    tsleep(NULL, 0, "acpislp", timo);
+    tsleep(NULL, PZERO, "acpislp", timo);
    return_VOID;
}

/*
 * Sleep for Microseconds; busy-wait with DELAY() when the request is
 * too short (<= 1ms) to be worth a context switch.
 */
void
AcpiOsSleepUsec (UINT32 Microseconds)
{
    FUNCTION_TRACE(__FUNCTION__);

    if (Microseconds > 1000) {	/* long enough to be worth the overhead of sleeping */
	AcpiOsSleep(0, Microseconds / 1000);
    } else {
	DELAY(Microseconds);
    }
    return_VOID;
}
Index: head/sys/i386/i386/genassym.c
===================================================================
--- head/sys/i386/i386/genassym.c	(revision 72375)
+++ head/sys/i386/i386/genassym.c	(revision 72376)
@@ -1,222 +1,220 @@
/*-
 * Copyright (c) 1982, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ #include "opt_user_ldt.h" #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); -#ifdef SMP ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu)); ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu)); -#endif ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); 
ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); #ifdef USER_LDT ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); #endif ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); 
ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, 
gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); #endif -#ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); +#ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVEINTR, offsetof(struct mtx, mtx_saveintr)); Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 72375) +++ head/sys/i386/i386/swtch.s (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 72375) +++ head/sys/i386/i386/trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace 
trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. 
*/ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. 
*/ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. 
*/ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); if (!grow_stack (p, va)) { PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); return (1); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } /* * syscall - MP aware system call request C handler * * A system call is essentially treated as a trap except that the * MP lock is not held on entry or return. We are responsible for * obtaining the MP lock if necessary and for handling ASTs * (e.g. a task switch) prior to return. * * In general, only simple access and manipulation of curproc and * the current stack is allowed without having to hold MP lock. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/ia64/ia64/trap.c =================================================================== --- head/sys/ia64/ia64/trap.c (revision 72375) +++ head/sys/ia64/ia64/trap.c (revision 72376) @@ -1,782 +1,781 @@ /* $FreeBSD$ */ /* From: src/sys/alpha/alpha/trap.c,v 1.33 */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. 
Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif u_int32_t want_resched; static int unaligned_fixup(struct trapframe *framep, struct proc *p); #ifdef WITNESS extern char *syscallnames[]; #endif /* * Define the code needed before returning to user mode, for * trap and syscall. */ void userret(register struct proc *p, struct trapframe *frame, u_quad_t oticks) { int sig, s; /* take pending signals */ while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (want_resched) { /* * Since we are curproc, a clock interrupt could * change our priority without changing run queues * (the running process is not kept on a run queue). 
* If this happened after we setrunqueue ourselves but * before we switch()'ed, we might not be on the queue * indicated by our priority. */ s = splstatclock(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * If profiling, charge recent system time to the trapped pc. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, frame->tf_cr_iip, (int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } static const char *ia64_vector_names[] = { "VHPT Translation", /* 0 */ "Instruction TLB", /* 1 */ "Data TLB", /* 2 */ "Alternate Instruction TLB", /* 3 */ "Alternate Data TLB", /* 4 */ "Data Nested TLB", /* 5 */ "Instruction Key Miss", /* 6 */ "Data Key Miss", /* 7 */ "Dirty-Bit", /* 8 */ "Instruction Access-Bit", /* 9 */ "Data Access-Bit", /* 10 */ "Break Instruction", /* 11 */ "External Interrupt", /* 12 */ "Reserved 13", /* 13 */ "Reserved 14", /* 14 */ "Reserved 15", /* 15 */ "Reserved 16", /* 16 */ "Reserved 17", /* 17 */ "Reserved 18", /* 18 */ "Reserved 19", /* 19 */ "Page Not Present", /* 20 */ "Key Permission", /* 21 */ "Instruction Access Rights", /* 22 */ "Data Access Rights", /* 23 */ "General Exception", /* 24 */ "Disabled FP-Register", /* 25 */ "NaT Consumption", /* 26 */ "Speculation", /* 27 */ "Reserved 28", /* 28 */ "Debug", /* 29 */ "Unaligned Reference", /* 30 */ "Unsupported Data Reference", /* 31 */ "Floating-point Fault", /* 32 */ "Floating-point Trap", /* 33 */ "Lower-Privilege Transfer Trap", /* 34 */ "Taken Branch Trap", /* 35 */ "Single Step Trap", /* 36 */ "Reserved 37", /* 37 */ "Reserved 38", /* 38 */ "Reserved 39", /* 39 */ "Reserved 40", /* 40 */ "Reserved 41", /* 41 */ "Reserved 42", /* 42 */ 
"Reserved 43", /* 43 */ "Reserved 44", /* 44 */ "IA-32 Exception", /* 45 */ "IA-32 Intercept", /* 46 */ "IA-32 Interrupt", /* 47 */ "Reserved 48", /* 48 */ "Reserved 49", /* 49 */ "Reserved 50", /* 50 */ "Reserved 51", /* 51 */ "Reserved 52", /* 52 */ "Reserved 53", /* 53 */ "Reserved 54", /* 54 */ "Reserved 55", /* 55 */ "Reserved 56", /* 56 */ "Reserved 57", /* 57 */ "Reserved 58", /* 58 */ "Reserved 59", /* 59 */ "Reserved 60", /* 60 */ "Reserved 61", /* 61 */ "Reserved 62", /* 62 */ "Reserved 63", /* 63 */ "Reserved 64", /* 64 */ "Reserved 65", /* 65 */ "Reserved 66", /* 66 */ "Reserved 67", /* 67 */ }; static void printtrap(int vector, int imm, struct trapframe *framep, int isfatal, int user) { printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap vector = 0x%x (%s)\n", vector, ia64_vector_names[vector]); printf(" cr.iip = 0x%lx\n", framep->tf_cr_iip); printf(" cr.ipsr = 0x%lx\n", framep->tf_cr_ipsr); printf(" cr.isr = 0x%lx\n", framep->tf_cr_isr); printf(" cr.ifa = 0x%lx\n", framep->tf_cr_ifa); printf(" cr.iim = 0x%x\n", imm); printf(" curproc = %p\n", curproc); if (curproc != NULL) printf(" pid = %d, comm = %s\n", curproc->p_pid, curproc->p_comm); printf("\n"); } /* * Trap is called from exception.s to handle most types of processor traps. * System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * architecture. 
*/ /*ARGSUSED*/ void trap(int vector, int imm, struct trapframe *framep) { struct proc *p; int i; u_int64_t ucode; u_quad_t sticks; int user; cnt.v_trap++; p = curproc; ucode = 0; user = ((framep->tf_cr_ipsr & IA64_PSR_CPL) == IA64_PSR_CPL_USER); if (user) { mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ } switch (vector) { case IA64_VEC_UNALIGNED_REFERENCE: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { mtx_lock(&Giant); if ((i = unaligned_fixup(framep, p)) == 0) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = framep->tf_cr_ifa; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case IA64_VEC_FLOATING_POINT_FAULT: case IA64_VEC_FLOATING_POINT_TRAP: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. */ if (user) { i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case IA64_VEC_BREAK: goto dopanic; case IA64_VEC_DISABLED_FP: /* * on exit from the kernel, if proc == fpcurproc, * FP is enabled. 
*/ if (PCPU_GET(fpcurproc) == p) { printf("trap: fp disabled for fpcurproc == %p", p); goto dopanic; } ia64_fpstate_switch(p); goto out; break; case IA64_VEC_PAGE_NOT_PRESENT: case IA64_VEC_INST_ACCESS_RIGHTS: case IA64_VEC_DATA_ACCESS_RIGHTS: { vm_offset_t va = framep->tf_cr_ifa; struct vmspace *vm = NULL; vm_map_t map; vm_prot_t ftype = 0; int rv; mtx_lock(&Giant); /* * If it was caused by fuswintr or suswintr, * just punt. Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && p != NULL && p->p_addr->u_pcb.pcb_onfault == (unsigned long)fswintrberr && p->p_addr->u_pcb.pcb_accessaddr == va) { framep->tf_cr_iip = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; mtx_unlock(&Giant); goto out; } /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. */ if (!user && ((va >= VM_MIN_KERNEL_ADDRESS) || (p == NULL) || (p->p_addr->u_pcb.pcb_onfault == 0))) { if (va >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && va < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } if (framep->tf_cr_isr & IA64_ISR_X) ftype = VM_PROT_EXECUTE; else if (framep->tf_cr_isr & IA64_ISR_R) ftype = VM_PROT_READ; else ftype = VM_PROT_WRITE; va = trunc_page((vm_offset_t)va); if (map != kernel_map) { /* * Keep swapout from messing with us * during this critical time. 
*/ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } nogo:; /* * If this was a stack access we keep track of the * maximum accessed stack size. Also, if vm_fault * gets a protection failure it is due to accessing * the stack region outside the current limit and * we need to reflect that as an access error. */ if (map != kernel_map && (caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (rv == KERN_SUCCESS) { unsigned nss; nss = ia64_btop(round_page(USRSTACK - va)); if (nss > vm->vm_ssize) vm->vm_ssize = nss; } else if (rv == KERN_PROTECTION_FAILURE) rv = KERN_INVALID_ADDRESS; } if (rv == KERN_SUCCESS) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = va; i = SIGSEGV; #ifdef DEBUG printtrap(vector, imm, framep, 1, user); #endif break; } default: goto dopanic; } #ifdef DEBUG printtrap(vector, imm, framep, 1, user); #endif trapsignal(p, i, ucode); out: if (user) { userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } return; dopanic: printtrap(vector, imm, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(vector, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in r15, and the arguments in the registers (as normal). 
They return * an error flag in r10 (if r10 != 0 on return, the syscall had an error), * and the return value (if any) in r8 and r9. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into a stack * buffer. On return, it restores r8-r10 from the frame before * returning to the user process. */ void syscall(int code, u_int64_t *args, struct trapframe *framep) { struct sysent *callp; struct proc *p; int error = 0; u_int64_t oldip, oldri; u_quad_t sticks; cnt.v_syscall++; p = curproc; p->p_md.md_tf = framep; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); /* * Skip past the break instruction. Remember old address in case * we have to restart. */ oldip = framep->tf_cr_iip; oldri = framep->tf_cr_ipsr & IA64_PSR_RI; framep->tf_cr_ipsr += IA64_PSR_RI_1; if ((framep->tf_cr_ipsr & IA64_PSR_RI) > IA64_PSR_RI_2) { framep->tf_cr_ipsr &= ~IA64_PSR_RI; framep->tf_cr_iip += 16; } #ifdef DIAGNOSTIC ia64_fpstate_check(p); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the ia64, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. 
*/ code = args[0]; args++; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, (callp->sy_narg & SYF_ARGMASK), args); #endif if (error == 0) { p->p_retval[0] = 0; p->p_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(p, args); } switch (error) { case 0: framep->tf_r[FRAME_R8] = p->p_retval[0]; framep->tf_r[FRAME_R9] = p->p_retval[1]; framep->tf_r[FRAME_R10] = 0; break; case ERESTART: framep->tf_cr_iip = oldip; framep->tf_cr_ipsr = (framep->tf_cr_ipsr & ~IA64_PSR_RI) | oldri; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_r[FRAME_R8] = error; framep->tf_r[FRAME_R10] = 1; break; } userret(p, framep, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Process the tail end of a fork() for the child. */ void child_return(p) struct proc *p; { /* * Return values in the frame set by cpu_fork(). */ userret(p, p->p_md.md_tf, 0); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, SYS_fork, 0, 0); } #endif if (mtx_owned(&Giant)) mtx_unlock(&Giant); } /* * Process an asynchronous software trap. * This is relatively easy. 
*/ void ast(framep) struct trapframe *framep; { register struct proc *p; u_quad_t sticks; p = curproc; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; if ((framep->tf_cr_ipsr & IA64_PSR_CPL) != IA64_PSR_CPL_USER) panic("ast and not user"); cnt.v_soft++; PCPU_SET(astpending, 0); mtx_lock_spin(&sched_lock); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } extern int ia64_unaligned_print, ia64_unaligned_fix; extern int ia64_unaligned_sigbus; static int unaligned_fixup(struct trapframe *framep, struct proc *p) { vm_offset_t va = framep->tf_cr_ifa; int doprint, dofix, dosigbus; int signal, size = 0; unsigned long uac; /* * Figure out what actions to take. */ if (p) uac = p->p_md.md_flags & MDP_UAC_MASK; else uac = 0; doprint = ia64_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = ia64_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = ia64_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. 
*/ if (doprint) { uprintf("pid %d (%s): unaligned access: va=0x%lx pc=0x%lx\n", p->p_pid, p->p_comm, va, p->p_md.md_tf->tf_cr_iip); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. */ signal = SIGBUS; if (dofix && size != 0) { /* * XXX not done yet. */ } /* * Force SIGBUS if requested. */ if (dosigbus) signal = SIGBUS; out: return (signal); } Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 72375) +++ head/sys/kern/init_main.c (revision 72376) @@ -1,605 +1,606 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_init_path.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct procsig procsig0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *initproc; int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ struct sysinit **sysinit = (struct sysinit **)sysinit_set.ls_items; struct sysinit **newsysinit; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count = 0; if (newsysinit) for (sipp = newsysinit; *sipp; sipp++) count++; else for (sipp = sysinit; *sipp; sipp++) count++; for (sipp = set; *sipp; sipp++) count++; count++; /* Trailing NULL */ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; *sipp; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; *sipp; sipp++) *xipp++ = *sipp; for (sipp = set; *sipp; sipp++) *xipp++ = *sipp; *xipp = NULL; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. 
Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; *sipp; sipp++) { for (xipp = sipp + 1; *xipp; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for (sipp = sysinit; *sipp; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; /* Call function */ (*((*sipp)->func))((*sipp)->udata); /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != (struct sysinit **)sysinit_set.ls_items) free(sysinit, M_TEMP); sysinit = newsysinit; newsysinit = NULL; goto restart; } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. 
**** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; p = &proc0; /* * Initialize magic number. */ p->p_magic = P_MAGIC; /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; #ifdef __ELF__ p->p_sysent = &elf_freebsd_sysvec; #else p->p_sysent = &aout_sysvec; #endif p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; p->p_stat = SRUN; p->p_nice = NZERO; - p->p_rtprio.type = RTP_PRIO_NORMAL; - p->p_rtprio.prio = 0; + p->p_pri.pri_class = PRI_TIMESHARE; + p->p_pri.pri_level = PVM; + p->p_pri.pri_user = PUSER; p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); callout_init(&p->p_itcallout, 0); callout_init(&p->p_slpcallout, 1); /* Create credentials. 
*/ cred0.p_refcnt = 1; cred0.p_uidinfo = uifind(0); p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); /* Don't jail it */ p->p_prison = 0; /* Create procsig. */ p->p_procsig = &procsig0; p->p_procsig->ps_refcnt = 1; /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_cpulimit = RLIM_INFINITY; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS)); vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); p->p_addr = proc0paddr; /* XXX */ /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_addr->u_stats; p->p_sigacts = &p->p_addr->u_sigacts; /* * Charge root for one process. 
*/ (void)chgproccnt(cred0.p_uidinfo, 1, 0); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; /* * Now we can look at the time, having had a chance to verify the * time from the file system. Pretend that proc0 started now. */ ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { microtime(&p->p_stats->p_start); p->p_runtime = 0; } ALLPROC_LOCK(AP_RELEASE); microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, ""); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct proc *p; mtx_lock(&Giant); p = curproc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) panic("cannot find root vnode"); p->p_fd->fd_cdir = rootvnode; VREF(p->p_fd->fd_cdir); p->p_fd->fd_rdir = rootvnode; VOP_UNLOCK(rootvnode, 0, p); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(USRSTACK - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strncpy(init_path, var, sizeof init_path); init_path[sizeof init_path - 1] = 0; } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (devfs_present) { (void)subyte(--ucp, 'd'); options = 1; } if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. 
If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = execve(p, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kthread_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { int error; error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM; PROC_UNLOCK(initproc); mtx_lock_spin(&sched_lock); initproc->p_sflag |= PS_INMEM; mtx_unlock_spin(&sched_lock); cpu_set_fork_handler(initproc, start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { mtx_lock_spin(&sched_lock); initproc->p_stat = SRUN; setrunqueue(initproc); mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: head/sys/kern/kern_condvar.c =================================================================== --- head/sys/kern/kern_condvar.c (revision 72375) +++ head/sys/kern/kern_condvar.c (revision 72376) @@ -1,546 +1,542 @@ /*- * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, mp, p) do { \ KASSERT((p) != NULL, ("%s: curproc NULL", __FUNCTION__)); \ KASSERT((p)->p_stat == SRUN, ("%s: not SRUN", __FUNCTION__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __FUNCTION__)); \ KASSERT((mp) != NULL, ("%s: mp NULL", __FUNCTION__)); \ mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ } while (0) #ifdef CV_DEBUG #define CV_WAIT_VALIDATE(cvp, mp) do { \ if (TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ /* Only waiter. */ \ (cvp)->cv_mtx = (mp); \ } else { \ /* \ * Other waiter; assert that we're using the \ * same mutex. 
\ */ \ KASSERT((cvp)->cv_mtx == (mp), \ ("%s: Multiple mutexes", __FUNCTION__)); \ } \ } while (0) #define CV_SIGNAL_VALIDATE(cvp) do { \ if (!TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ KASSERT(mtx_owned((cvp)->cv_mtx), \ ("%s: Mutex not owned", __FUNCTION__)); \ } \ } while (0) #else #define CV_WAIT_VALIDATE(cvp, mp) #define CV_SIGNAL_VALIDATE(cvp) #endif static void cv_timedwait_end(void *arg); /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { TAILQ_INIT(&cvp->cv_waitq); cvp->cv_mtx = NULL; cvp->cv_description = desc; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { KASSERT(cv_waitq_empty(cvp), ("%s: cv_waitq non-empty", __FUNCTION__)); } /* * Common code for cv_wait* functions. All require sched_lock. */ /* * Switch context. */ static __inline void cv_switch(struct proc *p) { p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); CTR3(KTR_PROC, "cv_switch: resume proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); } /* * Switch context, catching signals. */ static __inline int cv_switch_catch(struct proc *p) { int sig; /* * We put ourselves on the sleep queue and start our timeout before * calling CURSIG, as we could stop there, and a wakeup or a SIGCONT (or * both) could occur while we were stopped. A SIGCONT would cause us to * be marked as SSLEEP without resuming us, thus we must be ready for * sleep when CURSIG is called. If the wakeup happens while we're * stopped, p->p_wchan will be 0 upon return from CURSIG. */ p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); sig = CURSIG(p); mtx_lock_spin(&sched_lock); if (sig != 0) { if (p->p_wchan != NULL) cv_waitq_remove(p); p->p_stat = SRUN; } else if (p->p_wchan != NULL) { cv_switch(p); } p->p_sflag &= ~PS_SINTR; return sig; } /* * Add a process to the wait queue of a condition variable. 
*/ static __inline void cv_waitq_add(struct cv *cvp, struct proc *p) { /* * Process may be sitting on a slpque if asleep() was called, remove it * before re-adding. */ if (p->p_wchan != NULL) unsleep(p); p->p_sflag |= PS_CVWAITQ; p->p_wchan = cvp; p->p_wmesg = cvp->cv_description; p->p_slptime = 0; - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; CTR3(KTR_PROC, "cv_waitq_add: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); TAILQ_INSERT_TAIL(&cvp->cv_waitq, p, p_slpq); } /* * Wait on a condition variable. The current process is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the process. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void cv_wait(struct cv *cvp, struct mtx *mp) { struct proc *p; WITNESS_SAVE_DECL(mp); p = CURPROC; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. */ mtx_unlock_spin(&sched_lock); return; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); cv_switch(p); - curpriority = p->p_usrpri; mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); mtx_lock(mp); WITNESS_RESTORE(mp, mp); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the process was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. 
*/ int cv_wait_sig(struct cv *cvp, struct mtx *mp) { struct proc *p; int rval; int sig; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. */ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); sig = cv_switch_catch(p); - curpriority = p->p_usrpri; mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); /* proc_lock(p); */ if (sig == 0) sig = CURSIG(p); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } /* proc_unlock(p); */ #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout * expires. */ int cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) { struct proc *p; int rval; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. 
*/ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p); cv_switch(p); - curpriority = p->p_usrpri; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; rval = EWOULDBLOCK; } else callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Wait on a condition variable for at most timo/hz seconds, allowing * interruption by signals. Returns 0 if the process was resumed by cv_signal * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if * a signal was caught. */ int cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) { struct proc *p; int rval; int sig; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. 
*/ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p); sig = cv_switch_catch(p); - curpriority = p->p_usrpri; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; rval = EWOULDBLOCK; } else callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); /* proc_lock(p); */ if (sig == 0) sig = CURSIG(p); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } /* proc_unlock(p); */ #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Common code for signal and broadcast. Assumes waitq is not empty. Must be * called with sched_lock held. */ static __inline void cv_wakeup(struct cv *cvp) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); p = TAILQ_FIRST(&cvp->cv_waitq); KASSERT(p->p_wchan == cvp, ("%s: bogus wchan", __FUNCTION__)); KASSERT(p->p_sflag & PS_CVWAITQ, ("%s: not on waitq", __FUNCTION__)); TAILQ_REMOVE(&cvp->cv_waitq, p, p_slpq); p->p_sflag &= ~PS_CVWAITQ; p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR3(KTR_PROC, "cv_signal: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); } else { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } /* END INLINE EXPANSION */ } } /* * Signal a condition variable, wakes up one waiting process. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional processes * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. 
*/ void cv_signal(struct cv *cvp) { KASSERT(cvp != NULL, ("%s: cvp NULL", __FUNCTION__)); mtx_lock_spin(&sched_lock); if (!TAILQ_EMPTY(&cvp->cv_waitq)) { CV_SIGNAL_VALIDATE(cvp); cv_wakeup(cvp); } mtx_unlock_spin(&sched_lock); } /* * Broadcast a signal to a condition variable. Wakes up all waiting processes. * Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcast(struct cv *cvp) { KASSERT(cvp != NULL, ("%s: cvp NULL", __FUNCTION__)); mtx_lock_spin(&sched_lock); CV_SIGNAL_VALIDATE(cvp); while (!TAILQ_EMPTY(&cvp->cv_waitq)) cv_wakeup(cvp); mtx_unlock_spin(&sched_lock); } /* * Remove a process from the wait queue of its condition variable. This may be * called externally. */ void cv_waitq_remove(struct proc *p) { struct cv *cvp; mtx_lock_spin(&sched_lock); if ((cvp = p->p_wchan) != NULL && p->p_sflag & PS_CVWAITQ) { TAILQ_REMOVE(&cvp->cv_waitq, p, p_slpq); p->p_sflag &= ~PS_CVWAITQ; p->p_wchan = NULL; } mtx_unlock_spin(&sched_lock); } /* * Timeout function for cv_timedwait. Put the process on the runqueue and set * its timeout flag. */ static void cv_timedwait_end(void *arg) { struct proc *p; p = arg; CTR3(KTR_PROC, "cv_timedwait_end: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); mtx_lock_spin(&sched_lock); if (p->p_wchan != NULL) { if (p->p_stat == SSLEEP) setrunnable(p); else cv_waitq_remove(p); p->p_sflag |= PS_TIMEOUT; } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/kern_idle.c =================================================================== --- head/sys/kern/kern_idle.c (revision 72375) +++ head/sys/kern/kern_idle.c (revision 72376) @@ -1,117 +1,119 @@ /*- * Copyright (c) 2000, All rights reserved. 
See /usr/src/COPYRIGHT * * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include static void idle_setup(void *dummy); SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) static void idle_proc(void *dummy); /* * Setup per-cpu idle process contexts. The AP's shouldn't be running or * accessing their idle processes at this point, so don't bother with * locking. */ static void idle_setup(void *dummy) { struct globaldata *gd; int error; SLIST_FOREACH(gd, &cpuhead, gd_allcpu) { #ifdef SMP error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, RFSTOPPED|RFHIGHPID, "idle: cpu%d", gd->gd_cpuid); #else error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, RFSTOPPED|RFHIGHPID, "idle"); #endif if (error) panic("idle_setup: kthread_create error %d\n", error); gd->gd_idleproc->p_flag |= P_NOLOAD; gd->gd_idleproc->p_stat = SRUN; if (gd->gd_curproc == NULL) gd->gd_curproc = gd->gd_idleproc; } } /* * idle process context */ static void idle_proc(void *dummy) { #ifdef DIAGNOSTIC int count; #endif for (;;) { mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC count = 0; while (count >= 0 && procrunnable() == 0) { #else while (procrunnable() == 0) { #endif /* * This is a good place to put things to be done in * the background, including sanity checks. 
*/ #ifdef DIAGNOSTIC if (count++ < 0) CTR0(KTR_PROC, "idle_proc: timed out waiting" " for a process"); #endif +#if 0 if (vm_page_zero_idle() != 0) continue; +#endif #ifdef __i386__ cpu_idle(); #endif } mtx_lock_spin(&sched_lock); curproc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } } Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 72375) +++ head/sys/kern/kern_intr.c (revision 72376) @@ -1,537 +1,537 @@ /* * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* prototype for legacy_setsoftnet */ void *net_ih; void *vm_ih; void *softclock_ih; struct ithd *clk_ithd; struct ithd *tty_ithd; static struct mtx ithread_list_lock; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static void ithread_update(struct ithd *); static void ithread_loop(void *); static void ithread_init(void *); static void start_softintr(void *); static void swi_net(void *); u_char ithread_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTYLOW; break; case INTR_TYPE_BIO: /* * XXX We need to refine this. BSD/OS distinguishes * between tape and disk priorities. */ pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("ithread_priority: no interrupt type in flags"); } return pri; } /* * Regenerate the name (p_comm) and priority for a threaded interrupt thread. 
*/ static void ithread_update(struct ithd *ithd) { struct intrhand *ih; struct proc *p; int entropy; p = ithd->it_proc; if (p == NULL) return; strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name)); ih = TAILQ_FIRST(&ithd->it_handlers); if (ih == NULL) { - p->p_rtprio.prio = RTP_PRIO_MAX; + p->p_pri.pri_level = PRI_MAX_ITHD; ithd->it_flags &= ~IT_ENTROPY; return; } entropy = 0; - p->p_rtprio.prio = ih->ih_pri; + p->p_pri.pri_level = ih->ih_pri; TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 < sizeof(p->p_comm)) { strcat(p->p_comm, " "); strcat(p->p_comm, ih->ih_name); } else if (strlen(p->p_comm) + 1 == sizeof(p->p_comm)) { if (p->p_comm[sizeof(p->p_comm) - 2] == '+') p->p_comm[sizeof(p->p_comm) - 2] = '*'; else p->p_comm[sizeof(p->p_comm) - 2] = '+'; } else strcat(p->p_comm, "+"); if (ih->ih_flags & IH_ENTROPY) entropy++; } if (entropy) { printf("Warning, ithread (%d, %s) is an entropy source.\n", p->p_pid, p->p_comm); ithd->it_flags |= IT_ENTROPY; } else ithd->it_flags &= ~IT_ENTROPY; } int ithread_create(struct ithd **ithread, int vector, int flags, void (*disable)(int), void (*enable)(int), const char *fmt, ...) 
{ struct ithd *ithd; struct proc *p; int error; va_list ap; ithd = malloc(sizeof(struct ithd), M_ITHREAD, M_WAITOK | M_ZERO); ithd->it_vector = vector; ithd->it_disable = disable; ithd->it_enable = enable; ithd->it_flags = flags; TAILQ_INIT(&ithd->it_handlers); va_start(ap, fmt); vsnprintf(ithd->it_name, sizeof(ithd->it_name), fmt, ap); va_end(ap); error = kthread_create(ithread_loop, ithd, &p, RFSTOPPED | RFHIGHPID, ithd->it_name); if (error) { free(ithd, M_ITHREAD); return (error); } - p->p_rtprio.type = RTP_PRIO_ITHREAD; - p->p_rtprio.prio = RTP_PRIO_MAX; + p->p_pri.pri_class = PRI_ITHD; + p->p_pri.pri_level = PRI_MAX_ITHD; p->p_stat = SWAIT; ithd->it_proc = p; p->p_ithd = ithd; if (ithread != NULL) *ithread = ithd; return (0); } int ithread_destroy(struct ithd *ithread) { if (ithread == NULL || !TAILQ_EMPTY(&ithread->it_handlers)) return (EINVAL); mtx_lock_spin(&sched_lock); ithread->it_flags |= IT_DEAD; if (ithread->it_proc->p_stat == SWAIT) { ithread->it_proc->p_stat = SRUN; setrunqueue(ithread->it_proc); } mtx_unlock_spin(&sched_lock); return (0); } int ithread_add_handler(struct ithd* ithread, const char *name, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intrhand *ih, *temp_ih; if (ithread == NULL || name == NULL || handler == NULL) return (EINVAL); if ((flags & INTR_FAST) !=0) flags |= INTR_EXCL; ih = malloc(sizeof(struct intrhand), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_handler = handler; ih->ih_argument = arg; ih->ih_name = name; ih->ih_ithread = ithread; ih->ih_pri = pri; if (flags & INTR_FAST) ih->ih_flags = IH_FAST | IH_EXCLUSIVE; else if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; mtx_lock_spin(&ithread_list_lock); if ((flags & INTR_EXCL) !=0 && !TAILQ_EMPTY(&ithread->it_handlers)) goto fail; if (!TAILQ_EMPTY(&ithread->it_handlers) && (TAILQ_FIRST(&ithread->it_handlers)->ih_flags & 
IH_EXCLUSIVE) != 0) goto fail; TAILQ_FOREACH(temp_ih, &ithread->it_handlers, ih_next) if (temp_ih->ih_pri > ih->ih_pri) break; if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ithread->it_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); ithread_update(ithread); mtx_unlock_spin(&ithread_list_lock); if (cookiep != NULL) *cookiep = ih; return (0); fail: mtx_unlock_spin(&ithread_list_lock); free(ih, M_ITHREAD); return (EINVAL); } int ithread_remove_handler(void *cookie) { struct intrhand *handler = (struct intrhand *)cookie; struct ithd *ithread; #ifdef INVARIANTS struct intrhand *ih; int found; #endif if (handler == NULL || (ithread = handler->ih_ithread) == NULL) return (EINVAL); mtx_lock_spin(&ithread_list_lock); #ifdef INVARIANTS found = 0; TAILQ_FOREACH(ih, &ithread->it_handlers, ih_next) if (ih == handler) { found++; break; } if (found == 0) { mtx_unlock_spin(&ithread_list_lock); return (EINVAL); } #endif TAILQ_REMOVE(&ithread->it_handlers, handler, ih_next); ithread_update(ithread); mtx_unlock_spin(&ithread_list_lock); free(handler, M_ITHREAD); return (0); } int swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct proc *p; struct ithd *ithd; int error; ithd = (ithdp != NULL) ? *ithdp : NULL; if (ithd == NULL) { error = ithread_create(&ithd, pri, IT_SOFT, NULL, NULL, "swi%d:", pri); if (error) return (error); /* XXX - some hacks are _really_ gross */ p = ithd->it_proc; PROC_LOCK(p); if (pri == SWI_CLOCK) p->p_flag |= P_NOLOAD; PROC_UNLOCK(p); if (ithdp != NULL) *ithdp = ithd; } - return (ithread_add_handler(ithd, name, handler, arg, pri + PI_SOFT, - flags, cookiep)); + return (ithread_add_handler(ithd, name, handler, arg, + (pri * RQ_PPQ) + PI_SOFT, flags, cookiep)); } /* * Schedule a heavyweight software interrupt process. 
*/ void swi_sched(void *cookie, int flags) { struct intrhand *ih = (struct intrhand *)cookie; struct ithd *it = ih->ih_ithread; struct proc *p = it->it_proc; atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ CTR3(KTR_INTR, "swi_sched pid %d(%s) need=%d", p->p_pid, p->p_comm, it->it_need); /* * Set it_need so that if the thread is already running but close * to done, it will do another go-round. Then get the sched lock * and see if the thread is on whichkqs yet. If not, put it on * there. In any case, kick everyone so that if the new thread * is higher priority than their current thread, it gets run now. */ atomic_store_rel_int(&ih->ih_need, 1); if (!(flags & SWI_DELAY)) { it->it_need = 1; mtx_lock_spin(&sched_lock); if (p->p_stat == SWAIT) { /* not on run queue */ CTR1(KTR_INTR, "swi_sched: setrunqueue %d", p->p_pid); p->p_stat = SRUN; setrunqueue(p); if (!cold && flags & SWI_SWITCH) { if (curproc != PCPU_GET(idleproc)) setrunqueue(curproc); curproc->p_stats->p_ru.ru_nvcsw++; mi_switch(); } else need_resched(); } else { CTR3(KTR_INTR, "swi_sched %d: it_need %d, state %d", p->p_pid, it->it_need, p->p_stat ); } mtx_unlock_spin(&sched_lock); } } /* * This is the main code for interrupt threads. */ void ithread_loop(void *arg) { struct ithd *ithd; /* our thread context */ struct intrhand *ih; /* and our interrupt handler chain */ struct proc *p; p = curproc; ithd = (struct ithd *)arg; /* point to myself */ KASSERT(ithd->it_proc == p && p->p_ithd == ithd, (__func__ ": ithread and proc linkage out of sync")); /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. 
*/ if (ithd->it_flags & IT_DEAD) { CTR2(KTR_INTR, __func__ ": pid %d: (%s) exiting", p->p_pid, p->p_comm); p->p_ithd = NULL; mtx_lock(&Giant); free(ithd, M_ITHREAD); kthread_exit(0); } CTR3(KTR_INTR, __func__ ": pid %d: (%s) need=%d", p->p_pid, p->p_comm, ithd->it_need); while (ithd->it_need) { /* * Service interrupts. If another interrupt * arrives while we are running, they will set * it_need to denote that we should make * another pass. */ atomic_store_rel_int(&ithd->it_need, 0); TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { if (ithd->it_flags & IT_SOFT && !ih->ih_need) continue; atomic_store_rel_int(&ih->ih_need, 0); CTR5(KTR_INTR, __func__ ": pid %d ih=%p: %p(%p) flg=%x", p->p_pid, (void *)ih, (void *)ih->ih_handler, ih->ih_argument, ih->ih_flags); if ((ih->ih_flags & IH_MPSAFE) == 0) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if ((ih->ih_flags & IH_MPSAFE) == 0) mtx_unlock(&Giant); } } /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); if (!ithd->it_need) { /* * Should we call this earlier in the loop above? */ if (ithd->it_enable != NULL) ithd->it_enable(ithd->it_vector); p->p_stat = SWAIT; /* we're idle */ CTR1(KTR_INTR, __func__ ": pid %d: done", p->p_pid); mi_switch(); CTR1(KTR_INTR, __func__ ": pid %d: resumed", p->p_pid); } mtx_unlock_spin(&sched_lock); } } /* * Initialize mutex used to protect ithread handler lists. 
*/ static void ithread_init(void *dummy) { mtx_init(&ithread_list_lock, "ithread list lock", MTX_SPIN); } SYSINIT(ithread_init, SI_SUB_INTR, SI_ORDER_FIRST, ithread_init, NULL); /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "net", swi_net, NULL, SWI_NET, 0, &net_ih) || swi_add(&clk_ithd, "clock", softclock, NULL, SWI_CLOCK, INTR_MPSAFE, &softclock_ih) || swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, 0, &vm_ih)) panic("died while creating standard software ithreads"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) void legacy_setsoftnet(void) { swi_sched(net_ih, SWI_NOSWITCH); } /* * XXX: This should really be in the network code somewhere and installed * via a SI_SUB_SOFINTR, SI_ORDER_MIDDLE sysinit. */ void (*netisrs[32]) __P((void)); u_int netisr; int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } int unregister_netisr(num) int num; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("unregister_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = NULL; return (0); } static void swi_net(void *dummy) { u_int bits; int i; bits = atomic_readandclear_int(&netisr); while ((i = ffs(bits)) != 0) { i--; if (netisrs[i] != NULL) netisrs[i](); else printf("swi_net: unregistered isr number: %d.\n", i); bits &= ~(1 << i); } } Index: head/sys/kern/kern_mib.c =================================================================== --- head/sys/kern/kern_mib.c (revision 72375) +++ head/sys/kern/kern_mib.c (revision 72376) @@ -1,255 +1,259 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. 
* * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 * $FreeBSD$ */ #include #include #include #include #include #include #include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, "High kernel, proc, limits &c"); SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, "File system"); SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, "Network, (see socket.h)"); SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, "Sizeof various things"); SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, "hardware"); SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0, "Compatibility code"); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "Operating system type"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "Kernel version"); SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "Operating system type"); extern int osreldate; SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "Operating system release date"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, "Version of POSIX attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "Maximum number of groups a user can 
belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); #ifdef SMP SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, "Number of active CPUs"); #else SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, "Number of active CPUs"); #endif SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "System memory page size"); static char machine_arch[] = MACHINE_ARCH; SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); char hostname[MAXHOSTNAMELEN]; static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { int error; if (req->p->p_prison) { if (!jail_set_hostname_allowed && req->newptr) return(EPERM); error = sysctl_handle_string(oidp, req->p->p_prison->pr_host, sizeof req->p->p_prison->pr_host, req); } else error = sysctl_handle_string(oidp, hostname, sizeof hostname, req); return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_hostname, "A", "Hostname"); int securelevel = -1; static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { int error, level; level = securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); if (level < securelevel) return (EPERM); securelevel = level; return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); char 
domainname[MAXHOSTNAMELEN]; SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); long hostid; /* Some trouble here, if sizeof (int) != sizeof (long) */ SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. */ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, 
USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "Min Maximum number of types supported for timezone names"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, 0, sizeof(struct vnode), "sizeof(struct vnode)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, 0, sizeof(struct proc), "sizeof(struct proc)"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); #include #include SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, 0, sizeof(struct bio), "sizeof(struct bio)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), "sizeof(struct buf)"); + +#include +SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, + 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); Index: head/sys/kern/kern_mutex.c =================================================================== --- head/sys/kern/kern_mutex.c (revision 72375) +++ head/sys/kern/kern_mutex.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. */ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. 
*/ static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shuting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. */ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. 
*/ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. */ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. 
Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. */ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
*/ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. */ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. 
*/ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. */ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. 
*/ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); 
if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int 
retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. */ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 72375) +++ head/sys/kern/kern_proc.c (revision 72376) @@ -1,699 +1,696 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); int ps_showallprocs = 1; SYSCTL_INT(_kern, OID_AUTO, ps_showallprocs, CTLFLAG_RW, &ps_showallprocs, 0, ""); static void pgdelete __P((struct pgrp *)); static void orphanpg __P((struct pgrp *pg)); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct lock allproc_lock; struct lock proctree_lock; vm_zone_t proc_zone; vm_zone_t ithread_zone; /* * Initialize global process hashing structures. 
*/ void procinit() { lockinit(&allproc_lock, PZERO, "allproc", 0, 0); lockinit(&proctree_lock, PZERO, "proctree", 0, 0); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = zinit("PROC", sizeof (struct proc), 0, 0, 5); uihashinit(); /* * This should really be a compile time warning, but I do * not know of any way to do that... */ if (sizeof(struct kinfo_proc) != KINFO_PROC_SIZE) printf("WARNING: size of kinfo_proc (%ld) should be %d!!!\n", (long)sizeof(struct kinfo_proc), KINFO_PROC_SIZE); } /* * Is p an inferior of the current process? */ int inferior(p) register struct proc *p; { int rval = 1; PROCTREE_LOCK(PT_SHARED); for (; p != curproc; p = p->p_pptr) if (p->p_pid == 0) { rval = 0; break; } PROCTREE_LOCK(PT_RELEASE); return (rval); } /* * Locate a process by number */ struct proc * pfind(pid) register pid_t pid; { register struct proc *p; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, PIDHASH(pid), p_hash) if (p->p_pid == pid) break; ALLPROC_LOCK(AP_RELEASE); return (p); } /* * Locate a process group by number */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) if (pgrp->pg_id == pgid) return (pgrp); return (NULL); } /* * Move p to a new or existing process group (and session) */ int enterpgrp(p, pgid, mksess) register struct proc *p; pid_t pgid; int mksess; { register struct pgrp *pgrp = pgfind(pgid); KASSERT(pgrp == NULL || !mksess, ("enterpgrp: setsid into non-empty pgrp")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); if (pgrp == NULL) { pid_t savepid = p->p_pid; struct proc *np; /* * new process group */ KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK); if ((np = pfind(savepid)) == NULL || np != p) return (ESRCH); if (mksess) { register struct session *sess; /* * 
new session */ MALLOC(sess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); p->p_flag &= ~P_CONTROLT; pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; pgrp->pg_session->s_count++; } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); } else if (pgrp == p->p_pgrp) return (0); /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); LIST_REMOVE(p, p_pglist); if (LIST_EMPTY(&p->p_pgrp->pg_members)) pgdelete(p->p_pgrp); p->p_pgrp = pgrp; LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); return (0); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { LIST_REMOVE(p, p_pglist); if (LIST_EMPTY(&p->p_pgrp->pg_members)) pgdelete(p->p_pgrp); p->p_pgrp = 0; return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); if (--pgrp->pg_session->s_count == 0) FREE(pgrp->pg_session, M_SESSION); FREE(pgrp, M_PGRP); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). 
If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession = pgrp->pg_session; /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ PROCTREE_LOCK(PT_SHARED); if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) { if (entering) pgrp->pg_jobc++; else if (--pgrp->pg_jobc == 0) orphanpg(pgrp); } /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) if ((hispgrp = p->p_pgrp) != pgrp && hispgrp->pg_session == mysession && p->p_stat != SZOMB) { if (entering) hispgrp->pg_jobc++; else if (--hispgrp->pg_jobc == 0) orphanpg(hispgrp); } PROCTREE_LOCK(PT_RELEASE); } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. 
*/ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (p->p_stat == SSTOP) { LIST_FOREACH(p, &pg->pg_members, p_pglist) { psignal(p, SIGHUP); psignal(p, SIGCONT); } return; } } } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Fill in an kinfo_proc structure for the specified process. */ void fill_kinfo_proc(p, kp) struct proc *p; struct kinfo_proc *kp; { struct tty *tp; struct session *sp; bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; PROC_LOCK(p); kp->ki_addr = p->p_addr; kp->ki_args = p->p_args; kp->ki_tracep = p->p_tracep; kp->ki_textvp = p->p_textvp; kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; if (p->p_cred) { kp->ki_uid = p->p_cred->pc_ucred->cr_uid; kp->ki_ruid = p->p_cred->p_ruid; kp->ki_svuid = p->p_cred->p_svuid; kp->ki_ngroups = p->p_cred->pc_ucred->cr_ngroups; bcopy(p->p_cred->pc_ucred->cr_groups, kp->ki_groups, NGROUPS * sizeof(gid_t)); kp->ki_rgid = p->p_cred->p_rgid; kp->ki_svgid = p->p_cred->p_svgid; } if (p->p_procsig) { kp->ki_sigignore = p->p_procsig->ps_sigignore; kp->ki_sigcatch = p->p_procsig->ps_sigcatch; } mtx_lock_spin(&sched_lock); if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ kp->ki_swrss = vm->vm_swrss; 
kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } if ((p->p_sflag & PS_INMEM) && p->p_stats) { kp->ki_start = p->p_stats->p_start; kp->ki_rusage = p->p_stats->p_ru; kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec + p->p_stats->p_cru.ru_stime.tv_sec; kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec + p->p_stats->p_cru.ru_stime.tv_usec; } if (p->p_wmesg) { strncpy(kp->ki_wmesg, p->p_wmesg, WMESGLEN); kp->ki_wmesg[WMESGLEN] = 0; } if (p->p_stat == SMTX) { kp->ki_kiflag |= KI_MTXBLOCK; strncpy(kp->ki_mtxname, p->p_mtxname, MTXNAMELEN); kp->ki_mtxname[MTXNAMELEN] = 0; } kp->ki_stat = p->p_stat; kp->ki_sflag = p->p_sflag; kp->ki_pctcpu = p->p_pctcpu; kp->ki_estcpu = p->p_estcpu; kp->ki_slptime = p->p_slptime; kp->ki_swtime = p->p_swtime; kp->ki_wchan = p->p_wchan; kp->ki_traceflag = p->p_traceflag; - kp->ki_priority = p->p_priority; - kp->ki_usrpri = p->p_usrpri; - kp->ki_nativepri = p->p_nativepri; + kp->ki_pri = p->p_pri; kp->ki_nice = p->p_nice; - kp->ki_rtprio = p->p_rtprio; kp->ki_runtime = p->p_runtime; kp->ki_pid = p->p_pid; kp->ki_rqindex = p->p_rqindex; kp->ki_oncpu = p->p_oncpu; kp->ki_lastcpu = p->p_lastcpu; mtx_unlock_spin(&sched_lock); sp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; bcopy(sp->s_login, kp->ki_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag = KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; } } if ((p->p_flag & P_CONTROLT) && sp && ((tp = sp->s_ttyp) != NULL)) { kp->ki_tdev = dev2udev(tp->t_dev); kp->ki_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NOUDEV; if (p->p_comm[0] != 0) { strncpy(kp->ki_comm, p->p_comm, MAXCOMLEN); kp->ki_comm[MAXCOMLEN] = 0; } kp->ki_siglist = p->p_siglist; kp->ki_sigmask = p->p_sigmask; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_flag = p->p_flag; kp->ki_lock = p->p_lock; PROC_UNLOCK(p); PROCTREE_LOCK(PT_SHARED); if (p->p_pptr) kp->ki_ppid = p->p_pptr->p_pid; PROCTREE_LOCK(PT_RELEASE); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &zombproc, p_list) if (p->p_pid == pid) break; ALLPROC_LOCK(AP_RELEASE); return (p); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) { struct kinfo_proc kinfo_proc; int error; pid_t pid = p->p_pid; fill_kinfo_proc(p, &kinfo_proc); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); if (error) return (error); if (!doingzomb && pid && (pfind(pid) != p)) return EAGAIN; if (doingzomb && zpfind(pid) != p) return EAGAIN; return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; int doingzomb; int error = 0; if (oidp->oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (0); if (p_can(curproc, p, P_CAN_SEE, NULL)) return (0); error = sysctl_out_proc(p, req, 0); return (error); } if (oidp->oid_number == KERN_PROC_ALL && !namelen) ; else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) ; else return (EINVAL); if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } ALLPROC_LOCK(AP_SHARED); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Show a user only appropriate processes. 
*/ if (p_can(curproc, p, P_CAN_SEE, NULL)) continue; /* * Skip embryonic processes. */ if (p->p_stat == SIDL) continue; /* * TODO - make more efficient (see notes below). * do by session. */ switch (oidp->oid_number) { case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) continue; break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL || p->p_session->s_ttyp == NULL || dev2udev(p->p_session->s_ttyp->t_dev) != (udev_t)name[0]) continue; break; case KERN_PROC_UID: if (p->p_ucred == NULL || p->p_ucred->cr_uid != (uid_t)name[0]) continue; break; case KERN_PROC_RUID: if (p->p_ucred == NULL || p->p_cred->p_ruid != (uid_t)name[0]) continue; break; } if (p_can(curproc, p, P_CAN_SEE, NULL)) continue; error = sysctl_out_proc(p, req, doingzomb); if (error) { ALLPROC_LOCK(AP_RELEASE); return (error); } } } ALLPROC_LOCK(AP_RELEASE); return (0); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. 
*/ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; struct pargs *pa; int error = 0; if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (0); if ((!ps_argsopen) && p_can(curproc, p, P_CAN_SEE, NULL)) return (0); if (req->newptr && curproc != p) return (EPERM); if (req->oldptr && p->p_args != NULL) error = SYSCTL_OUT(req, p->p_args->ar_args, p->p_args->ar_length); if (req->newptr == NULL) return (error); if (p->p_args && --p->p_args->ar_ref == 0) FREE(p->p_args, M_PARGS); p->p_args = NULL; if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (error); MALLOC(pa, struct pargs *, sizeof(struct pargs) + req->newlen, M_PARGS, M_WAITOK); pa->ar_ref = 1; pa->ar_length = req->newlen; error = SYSCTL_IN(req, pa->ar_args, req->newlen); if (!error) p->p_args = pa; else FREE(pa, M_PARGS); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, sysctl_kern_proc_args, "Process argument list"); Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 72375) +++ head/sys/kern/kern_resource.c (revision 72376) @@ -1,861 +1,895 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the 
University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_rlimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int donice __P((struct proc *curp, struct proc *chgp, int n)); /* dosetrlimit non-static: Needed by SysVR4 emulator */ int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct mtx uihashtbl_mtx; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static struct uidinfo *uicreate __P((uid_t uid)); static struct uidinfo *uilookup __P((uid_t uid)); /* * Resource controls and accounting. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int getpriority(curp, uap) struct proc *curp; register struct getpriority_args *uap; { register struct proc *p; register int low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) p = curp; else p = pfind(uap->who); if (p == 0) break; if (p_can(curp, p, P_CAN_SEE, NULL)) break; low = p->p_nice; break; case PRIO_PGRP: { register struct pgrp *pg; if (uap->who == 0) pg = curp->p_pgrp; else if ((pg = pgfind(uap->who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (!p_can(curp, p, P_CAN_SEE, NULL) && p->p_nice < low) low = p->p_nice; } break; } case PRIO_USER: if (uap->who == 0) uap->who = curp->p_ucred->cr_uid; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) if (!p_can(curp, p, P_CAN_SEE, NULL) && p->p_ucred->cr_uid == uap->who && p->p_nice < low) low = p->p_nice; ALLPROC_LOCK(AP_RELEASE); break; default: return (EINVAL); } if (low == PRIO_MAX + 1) return (ESRCH); curp->p_retval[0] = low; return (0); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif /* ARGSUSED */ int setpriority(curp, uap) struct proc *curp; register struct setpriority_args *uap; { register struct proc *p; int found = 0, error = 0; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) p = curp; else p = pfind(uap->who); if (p == 0) break; if (p_can(curp, p, P_CAN_SEE, NULL)) break; error = donice(curp, p, uap->prio); found++; break; case PRIO_PGRP: { register struct pgrp *pg; if (uap->who == 0) pg = curp->p_pgrp; else if ((pg = pgfind(uap->who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (!p_can(curp, p, P_CAN_SEE, NULL)) { error = donice(curp, p, uap->prio); found++; } } break; } case PRIO_USER: if (uap->who == 0) uap->who = curp->p_ucred->cr_uid; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) if (p->p_ucred->cr_uid == uap->who && !p_can(curp, p, P_CAN_SEE, NULL)) { error = 
donice(curp, p, uap->prio); found++; } ALLPROC_LOCK(AP_RELEASE); break; default: return (EINVAL); } if (found == 0) return (ESRCH); return (error); } static int donice(curp, chgp, n) register struct proc *curp, *chgp; register int n; { int error; if ((error = p_can(curp, chgp, P_CAN_SCHED, NULL))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < chgp->p_nice && suser(curp)) return (EACCES); chgp->p_nice = n; (void)resetpriority(chgp); return (0); } /* rtprio system call */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif /* * Set realtime priority */ /* ARGSUSED */ int rtprio(curp, uap) struct proc *curp; register struct rtprio_args *uap; { register struct proc *p; struct rtprio rtp; int error; error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); if (uap->pid == 0) p = curp; else p = pfind(uap->pid); if (p == 0) return (ESRCH); switch (uap->function) { case RTP_LOOKUP: - return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + pri_to_rtp(&p->p_pri, &rtp); + return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_can(curp, p, P_CAN_SCHED, NULL))) return (error); /* disallow setting rtprio in most cases if not superuser */ if (suser(curp) != 0) { /* can't set someone else's */ if (uap->pid) return (EPERM); /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! 
XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) #endif if (rtp.type != RTP_PRIO_NORMAL) return (EPERM); } - switch (rtp.type) { -#ifdef RTP_PRIO_FIFO - case RTP_PRIO_FIFO: -#endif - case RTP_PRIO_REALTIME: - case RTP_PRIO_NORMAL: - case RTP_PRIO_IDLE: - if (rtp.prio > RTP_PRIO_MAX) - return (EINVAL); - p->p_rtprio = rtp; + if (rtp_to_pri(&rtp, &p->p_pri) == 0) return (0); - default: - return (EINVAL); - } - + return (EINVAL); default: return (EINVAL); } +} + +int +rtp_to_pri(struct rtprio *rtp, struct priority *pri) +{ + + if (rtp->prio > RTP_PRIO_MAX) + return (-1); + switch (RTP_PRIO_BASE(rtp->type)) { + case RTP_PRIO_REALTIME: + pri->pri_level = PRI_MIN_REALTIME + rtp->prio; + break; + case RTP_PRIO_NORMAL: + pri->pri_level = PRI_MIN_TIMESHARE + rtp->prio; + break; + case RTP_PRIO_IDLE: + pri->pri_level = PRI_MIN_IDLE + rtp->prio; + break; + default: + return (-1); + } + pri->pri_class = rtp->type; + pri->pri_native = pri->pri_level; + pri->pri_user = pri->pri_level; + return (0); +} + +void +pri_to_rtp(struct priority *pri, struct rtprio *rtp) +{ + + switch (PRI_BASE(pri->pri_class)) { + case PRI_REALTIME: + rtp->prio = pri->pri_level - PRI_MIN_REALTIME; + break; + case PRI_TIMESHARE: + rtp->prio = pri->pri_level - PRI_MIN_TIMESHARE; + break; + case PRI_IDLE: + rtp->prio = pri->pri_level - PRI_MIN_IDLE; + break; + default: + break; + } + rtp->type = pri->pri_class; } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* ARGSUSED */ int osetrlimit(p, uap) struct proc *p; register struct osetrlimit_args *uap; { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; return (dosetrlimit(p, uap->which, &lim)); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* ARGSUSED */ int 
ogetrlimit(p, uap) struct proc *p; register struct ogetrlimit_args *uap; { struct orlimit olim; if (uap->which >= RLIM_NLIMITS) return (EINVAL); olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; if (olim.rlim_cur == -1) olim.rlim_cur = 0x7fffffff; olim.rlim_max = p->p_rlimit[uap->which].rlim_max; if (olim.rlim_max == -1) olim.rlim_max = 0x7fffffff; return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int setrlimit(p, uap) struct proc *p; register struct __setrlimit_args *uap; { struct rlimit alim; int error; if ((error = copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) return (error); return (dosetrlimit(p, uap->which, &alim)); } int dosetrlimit(p, which, limp) struct proc *p; u_int which; struct rlimit *limp; { register struct rlimit *alimp; int error; if (which >= RLIM_NLIMITS) return (EINVAL); alimp = &p->p_rlimit[which]; /* * Preserve historical bugs by treating negative limits as unsigned. 
*/ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = suser_xxx(0, p, PRISON_ROOT))) return (error); if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; if (p->p_limit->p_refcnt > 1 && (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { p->p_limit->p_refcnt--; p->p_limit = limcopy(p->p_limit); alimp = &p->p_rlimit[which]; } switch (which) { case RLIMIT_CPU: if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) p->p_limit->p_cpulimit = RLIM_INFINITY; else p->p_limit->p_cpulimit = (rlim_t)1000000 * limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > MAXDSIZ) limp->rlim_cur = MAXDSIZ; if (limp->rlim_max > MAXDSIZ) limp->rlim_max = MAXDSIZ; break; case RLIMIT_STACK: if (limp->rlim_cur > MAXSSIZ) limp->rlim_cur = MAXSSIZ; if (limp->rlim_max > MAXSSIZ) limp->rlim_max = MAXSSIZ; /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. 
*/ if (limp->rlim_cur != alimp->rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > alimp->rlim_cur) { prot = VM_PROT_ALL; size = limp->rlim_cur - alimp->rlim_cur; addr = USRSTACK - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = alimp->rlim_cur - limp->rlim_cur; addr = USRSTACK - alimp->rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void) vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, FALSE); } break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; break; } *alimp = *limp; return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int getrlimit(p, uap) struct proc *p; register struct __getrlimit_args *uap; { if (uap->which >= RLIM_NLIMITS) return (EINVAL); return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, sizeof (struct rlimit))); } /* * Transform the running time and tick information in proc p into user, * system, and interrupt time usage. */ void calcru(p, up, sp, ip) struct proc *p; struct timeval *up; struct timeval *sp; struct timeval *ip; { /* {user, system, interrupt, total} {ticks, usec}; previous tu: */ u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu; int s; struct timeval tv; mtx_assert(&sched_lock, MA_OWNED); /* XXX: why spl-protect ? worst case is an off-by-one report */ s = splstatclock(); ut = p->p_uticks; st = p->p_sticks; it = p->p_iticks; splx(s); tt = ut + st + it; if (tt == 0) { st = 1; tt = 1; } tu = p->p_runtime; if (p == curproc) { /* * Adjust for the current time slice. This is actually fairly * important since the error here is on the order of a time * quantum, which is much greater than the sampling error. 
*/ microuptime(&tv); if (timevalcmp(&tv, PCPU_PTR(switchtime), <)) printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec), tv.tv_sec, tv.tv_usec); else tu += (tv.tv_usec - PCPU_GET(switchtime.tv_usec)) + (tv.tv_sec - PCPU_GET(switchtime.tv_sec)) * (int64_t)1000000; } ptu = p->p_uu + p->p_su + p->p_iu; if (tu < ptu || (int64_t)tu < 0) { /* XXX no %qd in kernel. Truncate. */ printf("calcru: negative time of %ld usec for pid %d (%s)\n", (long)tu, p->p_pid, p->p_comm); tu = ptu; } /* Subdivide tu. */ uu = (tu * ut) / tt; su = (tu * st) / tt; iu = tu - uu - su; /* Enforce monotonicity. */ if (uu < p->p_uu || su < p->p_su || iu < p->p_iu) { if (uu < p->p_uu) uu = p->p_uu; else if (uu + p->p_su + p->p_iu > tu) uu = tu - p->p_su - p->p_iu; if (st == 0) su = p->p_su; else { su = ((tu - uu) * st) / (st + it); if (su < p->p_su) su = p->p_su; else if (uu + su + p->p_iu > tu) su = tu - uu - p->p_iu; } KASSERT(uu + su + p->p_iu <= tu, ("calcru: monotonisation botch 1")); iu = tu - uu - su; KASSERT(iu >= p->p_iu, ("calcru: monotonisation botch 2")); } p->p_uu = uu; p->p_su = su; p->p_iu = iu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; if (ip != NULL) { ip->tv_sec = iu / 1000000; ip->tv_usec = iu % 1000000; } } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif /* ARGSUSED */ int getrusage(p, uap) register struct proc *p; register struct getrusage_args *uap; { register struct rusage *rup; switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); mtx_unlock_spin(&sched_lock); break; case RUSAGE_CHILDREN: rup = &p->p_stats->p_cru; break; default: return (EINVAL); } return (copyout((caddr_t)rup, (caddr_t)uap->rusage, sizeof (struct rusage))); } void ruadd(ru, ru2) register struct rusage *ru, *ru2; { register long *ip, *ip2; 
	register int i;

	/* Times add component-wise. */
	timevaladd(&ru->ru_utime, &ru2->ru_utime);
	timevaladd(&ru->ru_stime, &ru2->ru_stime);
	/* Maximum RSS is a high-water mark, not additive: keep the larger. */
	if (ru->ru_maxrss < ru2->ru_maxrss)
		ru->ru_maxrss = ru2->ru_maxrss;
	/*
	 * The remaining fields (ru_first .. ru_last) are treated as a flat
	 * array of longs and summed pairwise.
	 */
	ip = &ru->ru_first; ip2 = &ru2->ru_first;
	for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
		*ip++ += *ip2++;
}

/*
 * Make a copy of the plimit structure.
 * We share these structures copy-on-write after fork,
 * and copy when a limit is changed.
 */
struct plimit *
limcopy(lim)
	struct plimit *lim;
{
	register struct plimit *copy;

	MALLOC(copy, struct plimit *, sizeof(struct plimit),
	    M_SUBPROC, M_WAITOK);
	/*
	 * NOTE(review): the copy length is sizeof(struct plimit), not
	 * sizeof(copy->pl_rlimit); this presumably relies on pl_rlimit
	 * being the first member, and the trailing fields are overwritten
	 * immediately below — confirm against sys/resourcevar.h.
	 */
	bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit));
	copy->p_lflags = 0;
	/* A fresh copy starts out private with a single reference. */
	copy->p_refcnt = 1;
	return (copy);
}

/*
 * Find the uidinfo structure for a uid.  This structure is used to
 * track the total resource consumption (process count, socket buffer
 * size, etc.) for the uid and impose limits.
 */
void
uihashinit()
{

	/* One hash bucket per 16 allowed processes. */
	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
	mtx_init(&uihashtbl_mtx, "uidinfo hash", MTX_DEF);
}

/*
 * lookup a uidinfo struct for the parameter uid.
 * uihashtbl_mtx must be locked.
 */
static struct uidinfo *
uilookup(uid)
	uid_t uid;
{
	struct uihashhead *uipp;
	struct uidinfo *uip;

	mtx_assert(&uihashtbl_mtx, MA_OWNED);
	uipp = UIHASH(uid);
	/* Returns NULL (loop ran off the list) when the uid is not hashed. */
	LIST_FOREACH(uip, uipp, ui_hash)
		if (uip->ui_uid == uid)
			break;

	return (uip);
}

/*
 * Create a uidinfo struct for the parameter uid.
 * uihashtbl_mtx must be locked.
 */
static struct uidinfo *
uicreate(uid)
	uid_t uid;
{
	struct uidinfo *uip;

	mtx_assert(&uihashtbl_mtx, MA_OWNED);
	/* M_ZERO leaves ui_ref, ui_proccnt and ui_sbsize all zero. */
	MALLOC(uip, struct uidinfo *, sizeof(*uip), M_UIDINFO,
	    M_WAITOK | M_ZERO);
	LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
	uip->ui_uid = uid;
	mtx_init(&uip->ui_mtx, "uidinfo struct", MTX_DEF);
	return (uip);
}

/*
 * Find or allocate a struct uidinfo for a particular uid.
 * Increase refcount on uidinfo struct returned.
 * uifree() should be called on a struct uidinfo when released.
 */
struct uidinfo *
uifind(uid)
	uid_t uid;
{
	struct uidinfo *uip;

	mtx_lock(&uihashtbl_mtx);
	uip = uilookup(uid);
	if (uip == NULL)
		/* Not present yet: allocate while still holding the hash lock. */
		uip = uicreate(uid);
	uihold(uip);
	mtx_unlock(&uihashtbl_mtx);
	return (uip);
}

/*
 * Place another refcount on a uidinfo struct.
 */
void
uihold(uip)
	struct uidinfo *uip;
{

	mtx_lock(&uip->ui_mtx);
	uip->ui_ref++;
	mtx_unlock(&uip->ui_mtx);
}

/*-
 * Since uidinfo structs have a long lifetime, we use an
 * opportunistic refcounting scheme to avoid locking the lookup hash
 * for each release.
 *
 * If the refcount hits 0, we need to free the structure,
 * which means we need to lock the hash.
 * Optimal case:
 *   After locking the struct and lowering the refcount, if we find
 *   that we don't need to free, simply unlock and return.
 * Suboptimal case:
 *   If refcount lowering results in need to free, bump the count
 *   back up, lose the lock and acquire the locks in the proper
 *   order to try again.
 */
void
uifree(uip)
	struct uidinfo *uip;
{

	/* Prepare for optimal case. */
	mtx_lock(&uip->ui_mtx);

	if (--uip->ui_ref != 0) {
		mtx_unlock(&uip->ui_mtx);
		return;
	}

	/* Prepare for suboptimal case. */
	uip->ui_ref++;
	mtx_unlock(&uip->ui_mtx);
	/* Hash lock before struct lock: the required lock order. */
	mtx_lock(&uihashtbl_mtx);
	mtx_lock(&uip->ui_mtx);

	/*
	 * We must subtract one from the count again because we backed out
	 * our initial subtraction before dropping the lock.
	 * Since another thread may have added a reference after we dropped the
	 * initial lock we have to test for zero again.
	 */
	if (--uip->ui_ref == 0) {
		LIST_REMOVE(uip, ui_hash);
		mtx_unlock(&uihashtbl_mtx);
		/* Non-zero counters here indicate leaked accounting. */
		if (uip->ui_sbsize != 0)
			/* XXX no %qd in kernel.  Truncate. */
			printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
			    uip->ui_uid, (long)uip->ui_sbsize);
		if (uip->ui_proccnt != 0)
			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
			    uip->ui_uid, uip->ui_proccnt);
		mtx_destroy(&uip->ui_mtx);
		FREE(uip, M_UIDINFO);
		return;
	}

	mtx_unlock(&uihashtbl_mtx);
	mtx_unlock(&uip->ui_mtx);
}

/*
 * Change the count associated with number of processes
 * a given user is using.
 * When 'max' is 0, don't enforce a limit
 */
int
chgproccnt(uip, diff, max)
	struct uidinfo *uip;
	int diff;
	int max;
{

	mtx_lock(&uip->ui_mtx);
	/* don't allow them to exceed max, but allow subtraction */
	if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
		mtx_unlock(&uip->ui_mtx);
		return (0);	/* limit would be exceeded: refuse */
	}
	uip->ui_proccnt += diff;
	if (uip->ui_proccnt < 0)
		/* Accounting should never go negative; report the bug. */
		printf("negative proccnt for uid = %d\n", uip->ui_uid);
	mtx_unlock(&uip->ui_mtx);
	return (1);	/* change applied */
}

/*
 * Change the total socket buffer size a user has used.
 */
int
chgsbsize(uip, hiwat, to, max)
	struct uidinfo *uip;
	u_long *hiwat;
	u_long to;
	rlim_t max;
{
	rlim_t new;
	int s;

	s = splnet();
	mtx_lock(&uip->ui_mtx);
	/* New per-uid total if *hiwat were changed to 'to'. */
	new = uip->ui_sbsize + to - *hiwat;
	/* don't allow them to exceed max, but allow subtraction */
	if (to > *hiwat && new > max) {
		splx(s);
		mtx_unlock(&uip->ui_mtx);
		return (0);	/* limit would be exceeded: refuse */
	}
	uip->ui_sbsize = new;
	*hiwat = to;
	if (uip->ui_sbsize < 0)
		printf("negative sbsize for uid = %d\n", uip->ui_uid);
	splx(s);
	mtx_unlock(&uip->ui_mtx);
	return (1);	/* change applied */
}
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 72375)
+++ head/sys/kern/kern_sig.c	(revision 72376)
@@ -1,1856 +1,1856 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. 
*/ static int coredump __P((struct proc *)); static int do_sigaction __P((struct proc *p, int sig, struct sigaction *act, struct sigaction *oact, int old)); static int do_sigprocmask __P((struct proc *p, int how, sigset_t *set, sigset_t *oset, int old)); static char *expand_name __P((const char *, uid_t, pid_t)); static int killpg1 __P((struct proc *cp, int sig, int pgid, int all)); static int sig_ffs __P((sigset_t *set)); static int sigprop __P((int sig)); static void stop __P((struct proc *)); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); /* * Can process p, with pcred pc, send the signal sig to process q? */ #define CANSIGNAL(p, q, sig) \ (!p_can(p, q, P_CAN_KILL, NULL) || \ ((sig) == SIGCONT && (q)->p_session == (p)->p_session)) /* * Policy -- Can real uid ruid with ucred uc send a signal to process q? */ #define CANSIGIO(ruid, uc, q) \ ((uc)->cr_uid == 0 || \ (ruid) == (q)->p_cred->p_ruid || \ (uc)->cr_uid == (q)->p_cred->p_ruid || \ (ruid) == (q)->p_ucred->cr_uid || \ (uc)->cr_uid == (q)->p_ucred->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); /* * Signal properties and actions. 
* The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { SA_KILL, /* SIGHUP */ SA_KILL, /* SIGINT */ SA_KILL|SA_CORE, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL, /* SIGPIPE */ SA_KILL, /* SIGALRM */ SA_KILL, /* SIGTERM */ SA_IGNORE, /* SIGURG */ SA_STOP, /* SIGSTOP */ SA_STOP|SA_TTYSTOP, /* SIGTSTP */ SA_IGNORE|SA_CONT, /* SIGCONT */ SA_IGNORE, /* SIGCHLD */ SA_STOP|SA_TTYSTOP, /* SIGTTIN */ SA_STOP|SA_TTYSTOP, /* SIGTTOU */ SA_IGNORE, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL, /* SIGVTALRM */ SA_KILL, /* SIGPROF */ SA_IGNORE, /* SIGWINCH */ SA_IGNORE, /* SIGINFO */ SA_KILL, /* SIGUSR1 */ SA_KILL, /* SIGUSR2 */ }; /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). * * MP SAFE. 
 */
int
CURSIG(struct proc *p)
{
	sigset_t tmpset;
	int r;

	/* Fast path: no signals pending at all. */
	if (SIGISEMPTY(p->p_siglist))
		return (0);
	tmpset = p->p_siglist;
	SIGSETNAND(tmpset, p->p_sigmask);
	/* Everything pending is blocked and we are not being traced. */
	if (SIGISEMPTY(tmpset) && (p->p_flag & P_TRACED) == 0)
		return (0);
	/* issignal() still needs Giant; take it only on the slow path. */
	mtx_lock(&Giant);
	r = issignal(p);
	mtx_unlock(&Giant);
	return (r);
}

/*
 * Return the property bits (SA_KILL, SA_CORE, ...) from sigproptbl
 * for a signal, or 0 for an out-of-range signal number.
 */
static __inline int
sigprop(int sig)
{

	if (sig > 0 && sig < NSIG)
		return (sigproptbl[_SIG_IDX(sig)]);
	return (0);
}

/*
 * Return the lowest signal number set in *set (ffs() is 1-based,
 * offset by 32 per word), or 0 if the set is empty.
 */
static __inline int
sig_ffs(sigset_t *set)
{
	int i;

	for (i = 0; i < _SIG_WORDS; i++)
		if (set->__bits[i])
			return (ffs(set->__bits[i]) + (i * 32));
	return (0);
}

/*
 * do_sigaction
 *	sigaction
 *	osigaction
 *
 * Common backend for the sigaction() family: report the old
 * disposition through 'oact' (if non-NULL) and install 'act'
 * (if non-NULL).  'old' selects osigset-compatible behaviour.
 */
static int
do_sigaction(p, sig, act, oact, old)
	struct proc *p;
	register int sig;
	struct sigaction *act, *oact;
	int old;
{
	register struct sigacts *ps = p->p_sigacts;

	if (sig <= 0 || sig > _SIG_MAXSIG)
		return (EINVAL);
	if (oact) {
		/* Reconstruct sa_flags from the per-signal bit sets. */
		oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
		oact->sa_flags = 0;
		if (SIGISMEMBER(ps->ps_sigonstack, sig))
			oact->sa_flags |= SA_ONSTACK;
		if (!SIGISMEMBER(ps->ps_sigintr, sig))
			oact->sa_flags |= SA_RESTART;
		if (SIGISMEMBER(ps->ps_sigreset, sig))
			oact->sa_flags |= SA_RESETHAND;
		if (SIGISMEMBER(ps->ps_signodefer, sig))
			oact->sa_flags |= SA_NODEFER;
		if (SIGISMEMBER(ps->ps_siginfo, sig))
			oact->sa_flags |= SA_SIGINFO;
		if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDSTOP)
			oact->sa_flags |= SA_NOCLDSTOP;
		if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDWAIT)
			oact->sa_flags |= SA_NOCLDWAIT;
	}
	if (act) {
		/* SIGKILL and SIGSTOP may never be caught or ignored. */
		if ((sig == SIGKILL || sig == SIGSTOP) &&
		    act->sa_handler != SIG_DFL)
			return (EINVAL);

		/*
		 * Change setting atomically.
*/ (void) splhigh(); ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); #ifdef COMPAT_SUNOS if (act->sa_flags & SA_USERTRAMP) SIGADDSET(ps->ps_usertramp, sig); else SIGDELSET(ps->ps_usertramp, seg); #endif if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) p->p_procsig->ps_flag |= PS_NOCLDSTOP; else p->p_procsig->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; else p->p_procsig->ps_flag |= PS_NOCLDWAIT; } else p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; } /* * Set bit in p_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in p_sigignore, as we * have to restart the process. 
*/ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ SIGDELSET(p->p_siglist, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(p->p_sigignore, sig); SIGDELSET(p->p_sigcatch, sig); } else { SIGDELSET(p->p_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(p->p_sigcatch, sig); else SIGADDSET(p->p_sigcatch, sig); } if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || !old) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); (void) spl0(); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* ARGSUSED */ int sigaction(p, uap) struct proc *p; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = do_sigaction(p, uap->sig, actp, oactp, 0); if (oactp && !error) { error = copyout(oactp, uap->oact, sizeof(oact)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif /* ARGSUSED */ int osigaction(p, uap) struct proc *p; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = do_sigaction(p, uap->signum, nsap, osap, 1); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(p->p_sigignore, i); } /* * Reset signals for an exec of the specified process. */ void execsigs(p) register struct proc *p; { register struct sigacts *ps = p->p_sigacts; register int sig; /* * Reset caught signals. Held signals remain held * through p_sigmask (unless they were caught, * and are now ignored by default). */ while (SIGNOTEMPTY(p->p_sigcatch)) { sig = sig_ffs(&p->p_sigcatch); SIGDELSET(p->p_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(p->p_sigignore, sig); SIGDELSET(p->p_siglist, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ p->p_sigstk.ss_flags = SS_DISABLE; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_sp = 0; /* * Reset no zombies if child dies flag as Solaris does. */ p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; } /* * do_sigprocmask() - MP SAFE ONLY IF p == curproc * * Manipulate signal mask. This routine is MP SAFE *ONLY* if * p == curproc. Also remember that in order to remain MP SAFE * no spl*() calls may be made. 
 */
static int
do_sigprocmask(p, how, set, oset, old)
	struct proc *p;
	int how;
	sigset_t *set, *oset;
	int old;
{
	int error;

	/* Return the previous mask when the caller asked for it. */
	if (oset != NULL)
		*oset = p->p_sigmask;

	error = 0;
	if (set != NULL) {
		switch (how) {
		case SIG_BLOCK:
			/* SIG_CANTMASK strips the never-maskable signals. */
			SIG_CANTMASK(*set);
			SIGSETOR(p->p_sigmask, *set);
			break;
		case SIG_UNBLOCK:
			SIGSETNAND(p->p_sigmask, *set);
			break;
		case SIG_SETMASK:
			SIG_CANTMASK(*set);
			if (old)
				/*
				 * Old-style (osigset) callers: SIGSETLO
				 * presumably replaces only the low word
				 * of the mask — confirm in signalvar.h.
				 */
				SIGSETLO(p->p_sigmask, *set);
			else
				p->p_sigmask = *set;
			break;
		default:
			error = EINVAL;
			break;
		}
	}

	return (error);
}

/*
 * sigprocmask() - MP SAFE
 */
#ifndef _SYS_SYSPROTO_H_
struct sigprocmask_args {
	int	how;
	const sigset_t	*set;
	sigset_t	*oset;
};
#endif
int
sigprocmask(p, uap)
	register struct proc *p;
	struct sigprocmask_args *uap;
{
	sigset_t set, oset;
	sigset_t *setp, *osetp;
	int error;

	/* Either pointer may be NULL; copy in/out only what was supplied. */
	setp = (uap->set != NULL) ? &set : NULL;
	osetp = (uap->oset != NULL) ? &oset : NULL;
	if (setp) {
		error = copyin(uap->set, setp, sizeof(set));
		if (error)
			return (error);
	}
	error = do_sigprocmask(p, uap->how, setp, osetp, 0);
	if (osetp && !error) {
		error = copyout(osetp, uap->oset, sizeof(oset));
	}
	return (error);
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
/*
 * osigprocmask() - MP SAFE
 */
#ifndef _SYS_SYSPROTO_H_
struct osigprocmask_args {
	int	how;
	osigset_t mask;
};
#endif
int
osigprocmask(p, uap)
	register struct proc *p;
	struct osigprocmask_args *uap;
{
	sigset_t set, oset;
	int error;

	/* Old ABI passes the mask by value; widen, apply, narrow back. */
	OSIG2SIG(uap->mask, set);
	error = do_sigprocmask(p, uap->how, &set, &oset, 1);
	SIG2OSIG(oset, p->p_retval[0]);
	return (error);
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct sigpending_args {
	sigset_t	*set;
};
#endif
/* ARGSUSED */
int
sigpending(p, uap)
	struct proc *p;
	struct sigpending_args *uap;
{

	/* Copy the pending-signal set straight out to the caller. */
	return (copyout(&p->p_siglist, uap->set, sizeof(sigset_t)));
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigpending_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
osigpending(p, uap)
	struct proc *p;
	struct osigpending_args *uap;
{

	/* Old ABI returns the (truncated) pending set in the return value. */
	SIG2OSIG(p->p_siglist, p->p_retval[0]);
	return (0);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Generalized interface signal handler, 4.3-compatible.
 */
#ifndef _SYS_SYSPROTO_H_
struct osigvec_args {
	int	signum;
	struct sigvec	*nsv;
	struct sigvec	*osv;
};
#endif
/* ARGSUSED */
/*
 * osigvec: translate struct sigvec to/from struct sigaction and let
 * do_sigaction() do the real work.  SV_INTERRUPT has the opposite sense
 * of SA_RESTART, hence the XOR of SA_RESTART on both directions.
 */
int
osigvec(p, uap)
	struct proc *p;
	register struct osigvec_args *uap;
{
	struct sigvec vec;
	struct sigaction nsa, osa;
	register struct sigaction *nsap, *osap;
	int error;

	if (uap->signum <= 0 || uap->signum >= ONSIG)
		return (EINVAL);
	nsap = (uap->nsv != NULL) ? &nsa : NULL;
	osap = (uap->osv != NULL) ? &osa : NULL;
	if (nsap) {
		error = copyin(uap->nsv, &vec, sizeof(vec));
		if (error)
			return (error);
		nsap->sa_handler = vec.sv_handler;
		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
		nsap->sa_flags = vec.sv_flags;
		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
#ifdef COMPAT_SUNOS
		nsap->sa_flags |= SA_USERTRAMP;
#endif
	}
	error = do_sigaction(p, uap->signum, nsap, osap, 1);
	if (osap && !error) {
		vec.sv_handler = osap->sa_handler;
		SIG2OSIG(osap->sa_mask, vec.sv_mask);
		vec.sv_flags = osap->sa_flags;
		vec.sv_flags &= ~SA_NOCLDWAIT;
		vec.sv_flags ^= SA_RESTART;
#ifdef COMPAT_SUNOS
		vec.sv_flags &= ~SA_NOCLDSTOP;
#endif
		error = copyout(&vec, uap->osv, sizeof(vec));
	}
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct osigblock_args {
	int	mask;
};
#endif
/*
 * osigblock: old-style sigblock; OR 'mask' into the process signal mask
 * and return the previous (old-style) mask as the syscall result.
 */
int
osigblock(p, uap)
	register struct proc *p;
	struct osigblock_args *uap;
{
	sigset_t set;

	OSIG2SIG(uap->mask, set);
	SIG_CANTMASK(set);
	(void) splhigh();
	SIG2OSIG(p->p_sigmask, p->p_retval[0]);
	SIGSETOR(p->p_sigmask, set);
	(void) spl0();
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct osigsetmask_args {
	int	mask;
};
#endif
/*
 * osigsetmask: old-style sigsetmask; replace the low part of the signal
 * mask and return the previous (old-style) mask as the syscall result.
 */
int
osigsetmask(p, uap)
	struct proc *p;
	struct osigsetmask_args *uap;
{
	sigset_t set;

	OSIG2SIG(uap->mask, set);
	SIG_CANTMASK(set);
	(void) splhigh();
	SIG2OSIG(p->p_sigmask, p->p_retval[0]);
	SIGSETLO(p->p_sigmask, set);
	(void) spl0();
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Suspend process until signal, providing mask to be set
 * in the
 meantime.  Note nonstandard calling convention:
 * libc stub passes mask, not pointer, to save a copyin.
 */
#ifndef _SYS_SYSPROTO_H_
struct sigsuspend_args {
	const sigset_t	*sigmask;
};
#endif
/* ARGSUSED */
/*
 * sigsuspend: install a temporary signal mask and sleep until a signal
 * arrives; P_OLDMASK tells signal delivery to restore the saved mask
 * after the handler runs.
 */
int
sigsuspend(p, uap)
	register struct proc *p;
	struct sigsuspend_args *uap;
{
	sigset_t mask;
	register struct sigacts *ps = p->p_sigacts;
	int error;

	error = copyin(uap->sigmask, &mask, sizeof(mask));
	if (error)
		return (error);

	/*
	 * When returning from sigsuspend, we want
	 * the old mask to be restored after the
	 * signal handler has finished.  Thus, we
	 * save it here and mark the sigacts structure
	 * to indicate this.
	 */
	p->p_oldsigmask = p->p_sigmask;
	p->p_flag |= P_OLDMASK;
	SIG_CANTMASK(mask);
	p->p_sigmask = mask;
	while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0)
		/* void */;
	/* always return EINTR rather than ERESTART... */
	return (EINTR);
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigsuspend_args {
	osigset_t	mask;
};
#endif
/* ARGSUSED */
/*
 * osigsuspend: old-style sigsuspend; mask passed by value, and only the
 * low part of the process mask is replaced (SIGSETLO).
 */
int
osigsuspend(p, uap)
	register struct proc *p;
	struct osigsuspend_args *uap;
{
	sigset_t mask;
	register struct sigacts *ps = p->p_sigacts;

	p->p_oldsigmask = p->p_sigmask;
	p->p_flag |= P_OLDMASK;
	OSIG2SIG(uap->mask, mask);
	SIG_CANTMASK(mask);
	SIGSETLO(p->p_sigmask, mask);
	while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "opause", 0) == 0)
		/* void */;
	/* always return EINTR rather than ERESTART...
 */
	return (EINTR);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct osigstack_args {
	struct sigstack	*nss;
	struct sigstack	*oss;
};
#endif
/* ARGSUSED */
/*
 * osigstack: old-style alternate signal stack; get and/or set the
 * per-process sigstack from a struct sigstack.
 */
int
osigstack(p, uap)
	struct proc *p;
	register struct osigstack_args *uap;
{
	struct sigstack ss;
	int error;

	if (uap->oss != NULL) {
		ss.ss_sp = p->p_sigstk.ss_sp;
		ss.ss_onstack = sigonstack(cpu_getstack(p));
		error = copyout(&ss, uap->oss, sizeof(struct sigstack));
		if (error)
			return (error);
	}

	if (uap->nss != NULL) {
		if ((error = copyin(uap->nss, &ss, sizeof(ss))) != 0)
			return (error);
		p->p_sigstk.ss_sp = ss.ss_sp;
		/* Old interface carries no size information. */
		p->p_sigstk.ss_size = 0;
		p->p_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK;
		p->p_flag |= P_ALTSTACK;
	}
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

#ifndef _SYS_SYSPROTO_H_
struct sigaltstack_args {
	stack_t	*ss;
	stack_t	*oss;
};
#endif
/* ARGSUSED */
/*
 * sigaltstack: get and/or set the alternate signal stack.  Changing the
 * stack while currently running on it is refused with EPERM; a new
 * enabled stack must be at least sv_minsigstksz bytes.
 */
int
sigaltstack(p, uap)
	struct proc *p;
	register struct sigaltstack_args *uap;
{
	stack_t ss;
	int error, oonstack;

	oonstack = sigonstack(cpu_getstack(p));

	if (uap->oss != NULL) {
		ss = p->p_sigstk;
		ss.ss_flags = (p->p_flag & P_ALTSTACK)
		    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
		if ((error = copyout(&ss, uap->oss, sizeof(stack_t))) != 0)
			return (error);
	}

	if (uap->ss != NULL) {
		if (oonstack)
			return (EPERM);
		if ((error = copyin(uap->ss, &ss, sizeof(ss))) != 0)
			return (error);
		/* SS_DISABLE is the only flag callers may set. */
		if ((ss.ss_flags & ~SS_DISABLE) != 0)
			return (EINVAL);
		if (!(ss.ss_flags & SS_DISABLE)) {
			if (ss.ss_size < p->p_sysent->sv_minsigstksz)
				return (ENOMEM);
			p->p_sigstk = ss;
			p->p_flag |= P_ALTSTACK;
		} else
			p->p_flag &= ~P_ALTSTACK;
	}
	return (0);
}

/*
 * Common code for kill process group/broadcast kill.
 * cp is calling process.
 */
/*
 * killpg1: deliver 'sig' to every eligible process, either system-wide
 * ('all' nonzero, skipping system processes and the caller) or within
 * process group 'pgid' (0 meaning the caller's own group).  A sig of 0
 * performs only the permission/existence check.  Returns ESRCH when no
 * eligible target was found.
 */
int
killpg1(cp, sig, pgid, all)
	register struct proc *cp;
	int sig, pgid, all;
{
	register struct proc *p;
	struct pgrp *pgrp;
	int nfound = 0;

	if (all) {
		/*
		 * broadcast
		 */
		ALLPROC_LOCK(AP_SHARED);
		LIST_FOREACH(p, &allproc, p_list) {
			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
			    p == cp || !CANSIGNAL(cp, p, sig))
				continue;
			nfound++;
			if (sig)
				psignal(p, sig);
		}
		ALLPROC_LOCK(AP_RELEASE);
	} else {
		if (pgid == 0)
			/*
			 * zero pgid means send to my process group.
			 */
			pgrp = cp->p_pgrp;
		else {
			pgrp = pgfind(pgid);
			if (pgrp == NULL)
				return (ESRCH);
		}
		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
			    p->p_stat == SZOMB ||
			    !CANSIGNAL(cp, p, sig))
				continue;
			nfound++;
			if (sig)
				psignal(p, sig);
		}
	}
	return (nfound ? 0 : ESRCH);
}

#ifndef _SYS_SYSPROTO_H_
struct kill_args {
	int	pid;
	int	signum;
};
#endif
/* ARGSUSED */
/*
 * kill system call: signal a single process (pid > 0), or dispatch to
 * killpg1() for broadcast (pid == -1), the caller's own process group
 * (pid == 0), or an explicit group (pid < -1, group -pid).
 */
int
kill(cp, uap)
	register struct proc *cp;
	register struct kill_args *uap;
{
	register struct proc *p;

	if ((u_int)uap->signum > _SIG_MAXSIG)
		return (EINVAL);
	if (uap->pid > 0) {
		/* kill single process */
		if ((p = pfind(uap->pid)) == NULL)
			return (ESRCH);
		if (!CANSIGNAL(cp, p, uap->signum))
			return (EPERM);
		if (uap->signum)
			psignal(p, uap->signum);
		return (0);
	}
	switch (uap->pid) {
	case -1:		/* broadcast signal */
		return (killpg1(cp, uap->signum, 0, 1));
	case 0:			/* signal own process group */
		return (killpg1(cp, uap->signum, 0, 0));
	default:		/* negative explicit process group */
		return (killpg1(cp, uap->signum, -uap->pid, 0));
	}
	/* NOTREACHED */
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct okillpg_args {
	int	pgid;
	int	signum;
};
#endif
/* ARGSUSED */
/*
 * okillpg: old-style killpg; signal the given process group.
 */
int
okillpg(p, uap)
	struct proc *p;
	register struct okillpg_args *uap;
{

	if ((u_int)uap->signum > _SIG_MAXSIG)
		return (EINVAL);
	return (killpg1(p, uap->signum, uap->pgid, 0));
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Send a signal to a process group.
 */
/*
 * gsignal: look up process group 'pgid' and signal all of its members;
 * silently does nothing for pgid 0 or an unknown group.
 */
void
gsignal(pgid, sig)
	int pgid, sig;
{
	struct pgrp *pgrp;

	if (pgid && (pgrp = pgfind(pgid)))
		pgsignal(pgrp, sig, 0);
}

/*
 * Send a signal to a process group.  If checktty is 1,
 * limit to members which have a controlling terminal.
 */
void
pgsignal(pgrp, sig, checkctty)
	struct pgrp *pgrp;
	int sig, checkctty;
{
	register struct proc *p;

	if (pgrp)
		LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
			if (checkctty == 0 || p->p_flag & P_CONTROLT)
				psignal(p, sig);
}

/*
 * Send a signal caused by a trap to the current process.
 * If it will be caught immediately, deliver it with correct code.
 * Otherwise, post it normally.
 */
void
trapsignal(p, sig, code)
	struct proc *p;
	register int sig;
	u_long code;
{
	register struct sigacts *ps = p->p_sigacts;

	/*
	 * Fast path: untraced, caught, and unblocked -- deliver directly
	 * via the ABI's sendsig and update the mask/handler state inline.
	 */
	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(p->p_sigcatch, sig) &&
	    !SIGISMEMBER(p->p_sigmask, sig)) {
		p->p_stats->p_ru.ru_nsignals++;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_PSIG))
			ktrpsig(p->p_tracep, sig,
			    ps->ps_sigact[_SIG_IDX(sig)], &p->p_sigmask,
			    code);
#endif
		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig,
		    &p->p_sigmask, code);
		SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
		if (!SIGISMEMBER(ps->ps_signodefer, sig))
			SIGADDSET(p->p_sigmask, sig);
		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
			/*
			 * See do_sigaction() for origin of this code.
			 */
			SIGDELSET(p->p_sigcatch, sig);
			if (sig != SIGCONT &&
			    sigprop(sig) & SA_IGNORE)
				SIGADDSET(p->p_sigignore, sig);
			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
		}
	} else {
		p->p_code = code;	/* XXX for core dump/debugger */
		p->p_sig = sig;		/* XXX to verify code */
		psignal(p, sig);
	}
}

/*
 * Send the signal to the process.  If the signal has an action, the action
 * is usually performed by the target process rather than the caller; we add
 * the signal to the set of pending signals for the process.
 *
 * Exceptions:
 *   o When a stop signal is sent to a sleeping process that takes the
 *     default action, the process is stopped without awakening it.
 *   o SIGCONT restarts stopped processes (or puts them back to sleep)
 *     regardless of the signal action (eg, blocked or ignored).
 *
 * Other ignored signals are discarded immediately.
 */
void
psignal(p, sig)
	register struct proc *p;
	register int sig;
{
	register int prop;
	register sig_t action;

	if (sig > _SIG_MAXSIG || sig <= 0) {
		printf("psignal: signal %d\n", sig);
		panic("psignal signal number");
	}

	PROC_LOCK(p);
	KNOTE(&p->p_klist, NOTE_SIGNAL | sig);

	prop = sigprop(sig);
	/*
	 * If proc is traced, always give parent a chance;
	 * if signal event is tracked by procfs, give *that*
	 * a chance, as well.
	 */
	if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG))
		action = SIG_DFL;
	else {
		/*
		 * If the signal is being ignored,
		 * then we forget about it immediately.
		 * (Note: we don't set SIGCONT in p_sigignore,
		 * and if it is set to SIG_IGN,
		 * action will be SIG_DFL here.)
		 */
		if (SIGISMEMBER(p->p_sigignore, sig) || (p->p_flag & P_WEXIT)) {
			PROC_UNLOCK(p);
			return;
		}
		if (SIGISMEMBER(p->p_sigmask, sig))
			action = SIG_HOLD;
		else if (SIGISMEMBER(p->p_sigcatch, sig))
			action = SIG_CATCH;
		else
			action = SIG_DFL;
	}

	/* Undo positive niceness so default-fatal signals aren't delayed. */
	mtx_lock_spin(&sched_lock);
	if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) &&
	    (p->p_flag & P_TRACED) == 0)
		p->p_nice = NZERO;
	mtx_unlock_spin(&sched_lock);

	/* A continue signal cancels pending stops, and vice versa. */
	if (prop & SA_CONT)
		SIG_STOPSIGMASK(p->p_siglist);

	if (prop & SA_STOP) {
		/*
		 * If sending a tty stop signal to a member of an orphaned
		 * process group, discard the signal here if the action
		 * is default; don't stop the process below if sleeping,
		 * and don't clear any pending SIGCONT.
		 */
		if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
		    action == SIG_DFL) {
			PROC_UNLOCK(p);
			return;
		}
		SIG_CONTSIGMASK(p->p_siglist);
	}
	SIGADDSET(p->p_siglist, sig);

	/*
	 * Defer further processing for signals which are held,
	 * except that stopped processes must be continued by SIGCONT.
	 */
	mtx_lock_spin(&sched_lock);
	if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) {
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
		return;
	}

	switch (p->p_stat) {

	case SSLEEP:
		/*
		 * If process is sleeping uninterruptibly
		 * we can't interrupt the sleep... the signal will
		 * be noticed when the process returns through
		 * trap() or syscall().
		 */
		if ((p->p_sflag & PS_SINTR) == 0) {
			mtx_unlock_spin(&sched_lock);
			goto out;
		}
		/*
		 * Process is sleeping and traced... make it runnable
		 * so it can discover the signal in issignal() and stop
		 * for the parent.
		 */
		if (p->p_flag & P_TRACED)
			goto run;
		mtx_unlock_spin(&sched_lock);
		/*
		 * If SIGCONT is default (or ignored) and process is
		 * asleep, we are finished; the process should not
		 * be awakened.
		 */
		if ((prop & SA_CONT) && action == SIG_DFL) {
			SIGDELSET(p->p_siglist, sig);
			goto out;
		}
		/*
		 * When a sleeping process receives a stop
		 * signal, process immediately if possible.
		 * All other (caught or default) signals
		 * cause the process to run.
		 */
		if (prop & SA_STOP) {
			if (action != SIG_DFL)
				goto runfast;
			/*
			 * If a child holding parent blocked,
			 * stopping could cause deadlock.
			 */
			if (p->p_flag & P_PPWAIT)
				goto out;
			SIGDELSET(p->p_siglist, sig);
			p->p_xstat = sig;
			PROC_UNLOCK(p);
			PROCTREE_LOCK(PT_SHARED);
			if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
				psignal(p->p_pptr, SIGCHLD);
			stop(p);
			PROCTREE_LOCK(PT_RELEASE);
			PROC_LOCK(p);
			goto out;
		} else
			goto runfast;
		/* NOTREACHED */

	case SSTOP:
		mtx_unlock_spin(&sched_lock);
		/*
		 * If traced process is already stopped,
		 * then no further action is necessary.
		 */
		if (p->p_flag & P_TRACED)
			goto out;

		/*
		 * Kill signal always sets processes running.
		 */
		if (sig == SIGKILL)
			goto runfast;

		if (prop & SA_CONT) {
			/*
			 * If SIGCONT is default (or ignored), we continue the
			 * process but don't leave the signal in p_siglist, as
			 * it has no further action.  If SIGCONT is held, we
			 * continue the process and leave the signal in
			 * p_siglist.  If the process catches SIGCONT, let it
			 * handle the signal itself.  If it isn't waiting on
			 * an event, then it goes back to run state.
			 * Otherwise, process goes back to sleep state.
			 */
			if (action == SIG_DFL)
				SIGDELSET(p->p_siglist, sig);
			if (action == SIG_CATCH)
				goto runfast;
			mtx_lock_spin(&sched_lock);
			if (p->p_wchan == NULL)
				goto run;
			p->p_stat = SSLEEP;
			mtx_unlock_spin(&sched_lock);
			goto out;
		}

		if (prop & SA_STOP) {
			/*
			 * Already stopped, don't need to stop again.
			 * (If we did the shell could get confused.)
			 */
			SIGDELSET(p->p_siglist, sig);
			goto out;
		}

		/*
		 * If process is sleeping interruptibly, then simulate a
		 * wakeup so that when it is continued, it will be made
		 * runnable and can look at the signal.  But don't make
		 * the process runnable, leave it stopped.
		 */
		mtx_lock_spin(&sched_lock);
		if (p->p_wchan && p->p_sflag & PS_SINTR) {
			if (p->p_sflag & PS_CVWAITQ)
				cv_waitq_remove(p);
			else
				unsleep(p);
		}
		mtx_unlock_spin(&sched_lock);
		goto out;

	default:
		/*
		 * SRUN, SIDL, SZOMB do nothing with the signal,
		 * other than kicking ourselves if we are running.
		 * It will either never be noticed, or noticed very soon.
		 */
		if (p == curproc) {
			signotify(p);
			mtx_unlock_spin(&sched_lock);
		}
#ifdef SMP
		else if (p->p_stat == SRUN) {
			mtx_unlock_spin(&sched_lock);
			forward_signal(p);
		}
#endif
		else
			mtx_unlock_spin(&sched_lock);
		goto out;
	}
	/*NOTREACHED*/

runfast:
	/*
	 * Raise priority to at least PUSER.
	 */
	mtx_lock_spin(&sched_lock);
-	if (p->p_priority > PUSER)
-		p->p_priority = PUSER;
+	if (p->p_pri.pri_level > PUSER)
+		p->p_pri.pri_level = PUSER;
run:
	/* If we jump here, sched_lock has to be owned. */
	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
	setrunnable(p);
	mtx_unlock_spin(&sched_lock);
out:
	/* If we jump here, sched_lock should not be owned. */
	mtx_assert(&sched_lock, MA_NOTOWNED);
	PROC_UNLOCK(p);
}

/*
 * If the current process has received a signal (should be caught or cause
 * termination, should interrupt current syscall), return the signal number.
 * Stop signals with default action are processed immediately, then cleared;
 * they aren't returned.  This is checked after each entry to the system for
 * a syscall or trap (though this can usually be done without calling issignal
 * by checking the pending signal masks in the CURSIG macro.)  The normal call
 * sequence is
 *
 *	while (sig = CURSIG(curproc))
 *		postsig(sig);
 */
int
issignal(p)
	register struct proc *p;
{
	sigset_t mask;
	register int sig, prop;

	for (;;) {
		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);

		/* Pending and not blocked; stops are deferred during PPWAIT. */
		mask = p->p_siglist;
		SIGSETNAND(mask, p->p_sigmask);
		if (p->p_flag & P_PPWAIT)
			SIG_STOPSIGMASK(mask);
		if (!SIGNOTEMPTY(mask))		/* no signal to send */
			return (0);
		sig = sig_ffs(&mask);
		prop = sigprop(sig);

		STOPEVENT(p, S_SIG, sig);

		/*
		 * We should see pending but ignored signals
		 * only if P_TRACED was on when they were posted.
		 */
		if (SIGISMEMBER(p->p_sigignore, sig) && (traced == 0)) {
			SIGDELSET(p->p_siglist, sig);
			continue;
		}
		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
			/*
			 * If traced, always stop, and stay
			 * stopped until released by the parent.
			 */
			p->p_xstat = sig;
			PROCTREE_LOCK(PT_SHARED);
			psignal(p->p_pptr, SIGCHLD);
			do {
				stop(p);
				PROCTREE_LOCK(PT_RELEASE);
				mtx_lock_spin(&sched_lock);
				DROP_GIANT_NOSWITCH();
				mi_switch();
				mtx_unlock_spin(&sched_lock);
				PICKUP_GIANT();
				PROCTREE_LOCK(PT_SHARED);
			} while (!trace_req(p)
				 && p->p_flag & P_TRACED);
			PROCTREE_LOCK(PT_RELEASE);

			/*
			 * If the traced bit got turned off, go back up
			 * to the top to rescan signals.  This ensures
			 * that p_sig* and ps_sigact are consistent.
			 */
			if ((p->p_flag & P_TRACED) == 0)
				continue;

			/*
			 * If parent wants us to take the signal,
			 * then it will leave it in p->p_xstat;
			 * otherwise we just look for signals again.
			 */
			SIGDELSET(p->p_siglist, sig);	/* clear old signal */
			sig = p->p_xstat;
			if (sig == 0)
				continue;

			/*
			 * Put the new signal into p_siglist.  If the
			 * signal is being masked, look for other signals.
			 */
			SIGADDSET(p->p_siglist, sig);
			if (SIGISMEMBER(p->p_sigmask, sig))
				continue;
		}

		/*
		 * Decide whether the signal should be returned.
		 * Return the signal's number, or fall through
		 * to clear it from the pending mask.
		 */
		switch ((int)(intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {

		case (int)SIG_DFL:
			/*
			 * Don't take default actions on system processes.
			 */
			if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
				/*
				 * Are you sure you want to ignore SIGSEGV
				 * in init? XXX
				 */
				printf("Process (pid %lu) got signal %d\n",
				    (u_long)p->p_pid, sig);
#endif
				break;		/* == ignore */
			}
			/*
			 * If there is a pending stop signal to process
			 * with default action, stop here,
			 * then clear the signal.  However,
			 * if process is member of an orphaned
			 * process group, ignore tty stop signals.
			 */
			if (prop & SA_STOP) {
				if (p->p_flag & P_TRACED ||
				    (p->p_pgrp->pg_jobc == 0 &&
				    prop & SA_TTYSTOP))
					break;	/* == ignore */
				p->p_xstat = sig;
				PROCTREE_LOCK(PT_SHARED);
				stop(p);
				if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
					psignal(p->p_pptr, SIGCHLD);
				PROCTREE_LOCK(PT_RELEASE);
				mtx_lock_spin(&sched_lock);
				DROP_GIANT_NOSWITCH();
				mi_switch();
				mtx_unlock_spin(&sched_lock);
				PICKUP_GIANT();
				break;
			} else if (prop & SA_IGNORE) {
				/*
				 * Except for SIGCONT, shouldn't get here.
				 * Default action is to ignore; drop it.
				 */
				break;		/* == ignore */
			} else
				return (sig);
			/*NOTREACHED*/

		case (int)SIG_IGN:
			/*
			 * Masking above should prevent us ever trying
			 * to take action on an ignored signal other
			 * than SIGCONT, unless process is traced.
			 */
			if ((prop & SA_CONT) == 0 &&
			    (p->p_flag & P_TRACED) == 0)
				printf("issignal\n");
			break;		/* == ignore */

		default:
			/*
			 * This signal has an action, let
			 * postsig() process it.
			 */
			return (sig);
		}
		SIGDELSET(p->p_siglist, sig);		/* take the signal! */
	}
	/* NOTREACHED */
}

/*
 * Put the argument process into the stopped state and notify the parent
 * via wakeup.  Signals are handled elsewhere.  The process must not be
 * on the run queue.  Must be called with at least a shared hold of the
 * proctree lock.
 */
void
stop(p)
	register struct proc *p;
{

	PROCTREE_ASSERT(PT_SHARED);
	mtx_lock_spin(&sched_lock);
	p->p_stat = SSTOP;
	p->p_flag &= ~P_WAITED;
	/* Wake a parent sleeping in wait(). */
	wakeup((caddr_t)p->p_pptr);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Take the action for the specified signal
 * from the current set of pending signals.
 */
void
postsig(sig)
	register int sig;
{
	register struct proc *p = curproc;
	struct sigacts *ps = p->p_sigacts;
	sig_t action;
	sigset_t returnmask;
	int code;

	KASSERT(sig != 0, ("postsig"));

	SIGDELSET(p->p_siglist, sig);
	action = ps->ps_sigact[_SIG_IDX(sig)];
#ifdef KTRACE
	if (KTRPOINT(p, KTR_PSIG))
		ktrpsig(p->p_tracep, sig, action, p->p_flag & P_OLDMASK ?
		    &p->p_oldsigmask : &p->p_sigmask, 0);
#endif
	STOPEVENT(p, S_SIG, sig);

	if (action == SIG_DFL) {
		/*
		 * Default action, where the default is to kill
		 * the process.  (Other cases were ignored above.)
		 */
		sigexit(p, sig);
		/* NOTREACHED */
	} else {
		/*
		 * If we get here, the signal must be caught.
		 */
		KASSERT(action != SIG_IGN && !SIGISMEMBER(p->p_sigmask, sig),
		    ("postsig action"));
		/*
		 * Set the new mask value and also defer further
		 * occurrences of this signal.
		 *
		 * Special case: user has done a sigsuspend.  Here the
		 * current mask is not of interest, but rather the
		 * mask from before the sigsuspend is what we want
		 * restored after the signal processing is completed.
		 */
		(void) splhigh();
		if (p->p_flag & P_OLDMASK) {
			returnmask = p->p_oldsigmask;
			p->p_flag &= ~P_OLDMASK;
		} else
			returnmask = p->p_sigmask;

		SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
		if (!SIGISMEMBER(ps->ps_signodefer, sig))
			SIGADDSET(p->p_sigmask, sig);

		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
			/*
			 * See do_sigaction() for origin of this code.
			 */
			SIGDELSET(p->p_sigcatch, sig);
			if (sig != SIGCONT &&
			    sigprop(sig) & SA_IGNORE)
				SIGADDSET(p->p_sigignore, sig);
			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
		}
		(void) spl0();
		p->p_stats->p_ru.ru_nsignals++;
		/* p_sig/p_code carry trap info only for the matching signal. */
		if (p->p_sig != sig) {
			code = 0;
		} else {
			code = p->p_code;
			p->p_code = 0;
			p->p_sig = 0;
		}
		(*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code);
	}
}

/*
 * Kill the current process for stated reason.
 */
void
killproc(p, why)
	struct proc *p;
	char *why;
{

	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
		p, p->p_pid, p->p_comm);
	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
		p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why);
	psignal(p, SIGKILL);
}

/*
 * Force the current process to exit with the specified signal, dumping core
 * if appropriate.  We bypass the normal tests for masked and caught signals,
 * allowing unrecoverable failures to terminate the process without changing
 * signal state.  Mark the accounting record with the signal termination.
 * If dumping core, save the signal number for the debugger.  Calls exit and
 * does not return.
 */
void
sigexit(p, sig)
	register struct proc *p;
	int sig;
{

	p->p_acflag |= AXSIG;
	if (sigprop(sig) & SA_CORE) {
		p->p_sig = sig;
		/*
		 * Log signals which would cause core dumps
		 * (Log as LOG_INFO to appease those who don't want
		 * these messages.)
		 * XXX : Todo, as well as euid, write out ruid too
		 */
		if (coredump(p) == 0)
			sig |= WCOREFLAG;
		if (kern_logsigexit)
			log(LOG_INFO,
			    "pid %d (%s), uid %d: exited on signal %d%s\n",
			    p->p_pid, p->p_comm,
			    p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1,
			    sig &~ WCOREFLAG,
			    sig & WCOREFLAG ? " (core dumped)" : "");
	}
	exit1(p, W_EXITCODE(0, sig));
	/* NOTREACHED */
}

/* Format template for core file names; tunable via kern.corefile. */
static char corefilename[MAXPATHLEN+1] = {"%N.core"};
SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
	sizeof(corefilename), "process corefile name format string");

/*
 * expand_name(name, uid, pid)
 * Expand the name described in corefilename, using name, uid, and pid.
 * corefilename is a printf-like string, with three format specifiers:
 *	%N	name of process ("name")
 *	%P	process id (pid)
 *	%U	user id (uid)
 * For example, "%N.core" is the default; they can be disabled completely
 * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
 * This is controlled by the sysctl variable kern.corefile (see above).
 *
 * Returns a malloc'd (M_TEMP) path of at most MAXPATHLEN characters that
 * the caller must free, or NULL on allocation failure or overlong result.
 */
static char *
expand_name(name, uid, pid)
	const char *name;
	uid_t uid;
	pid_t pid;
{
	char *temp;
	char buf[11];		/* Buffer for pid/uid -- max 4B */
	int i, n;
	char *format = corefilename;
	size_t namelen;

	temp = malloc(MAXPATHLEN + 1, M_TEMP, M_NOWAIT);
	if (temp == NULL)
		return NULL;
	namelen = strlen(name);
	/* i walks the format, n tracks the output length. */
	for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) {
		int l;

		switch (format[i]) {
		case '%':	/* Format character */
			i++;
			switch (format[i]) {
			case '%':
				temp[n++] = '%';
				break;
			case 'N':	/* process name */
				if ((n + namelen) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, name, namelen);
				n += namelen;
				break;
			case 'P':	/* process id */
				l = sprintf(buf, "%u", pid);
				if ((n + l) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, buf, l);
				n += l;
				break;
			case 'U':	/* user id */
				l = sprintf(buf, "%u", uid);
				if ((n + l) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, buf, l);
				n += l;
				break;
			default:
				log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format);
			}
			break;
		default:
			temp[n++] = format[i];
		}
	}
	temp[n] = '\0';
	return temp;
}

/*
 * Dump a process' core.
 The main routine does some
 * policy checking, and creates the name of the coredump;
 * then it passes on a vnode and a size limit to the process-specific
 * coredump routine if there is one; if there _is not_ one, it returns
 * ENOSYS; otherwise it returns the error from the process-specific routine.
 */
static int
coredump(p)
	register struct proc *p;
{
	register struct vnode *vp;
	register struct ucred *cred = p->p_ucred;
	struct nameidata nd;
	struct vattr vattr;
	int error, error1, flags;
	struct mount *mp;
	char *name;			/* name of corefile */
	off_t limit;

	STOPEVENT(p, S_CORE, 0);

	/* Refuse set-id processes unless sugid_coredump allows them. */
	if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0)
		return (EFAULT);

	/*
	 * Note that the bulk of limit checking is done after
	 * the corefile is created.  The exception is if the limit
	 * for corefiles is 0, in which case we don't bother
	 * creating the corefile at all.  This layout means that
	 * a corefile is truncated instead of not being created,
	 * if it is larger than the limit.
	 */
	limit = p->p_rlimit[RLIMIT_CORE].rlim_cur;
	if (limit == 0)
		return 0;

restart:
	name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid);
	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, p);
	flags = O_CREAT | FWRITE | O_NOFOLLOW;
	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR);
	free(name, M_TEMP);
	if (error)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	/* If the mount point is suspended, close and retry after it wakes. */
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0, p);
		if ((error = vn_close(vp, FWRITE, cred, p)) != 0)
			return (error);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}

	/* Don't dump to non-regular files or files with links. */
	if (vp->v_type != VREG ||
	    VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
		error = EFAULT;
		goto out;
	}
	/* Truncate any existing file before writing the dump. */
	VATTR_NULL(&vattr);
	vattr.va_size = 0;
	VOP_LEASE(vp, p, cred, LEASE_WRITE);
	VOP_SETATTR(vp, &vattr, cred, p);
	p->p_acflag |= ACORE;

	error = p->p_sysent->sv_coredump ?
	  p->p_sysent->sv_coredump(p, vp, limit) :
	  ENOSYS;

out:
	VOP_UNLOCK(vp, 0, p);
	vn_finished_write(mp);
	error1 = vn_close(vp, FWRITE, cred, p);
	if (error == 0)
		error = error1;
	return (error);
}

/*
 * Nonexistent system call-- signal process (may want to handle it).
 * Flag error in case process won't see signal immediately (blocked or ignored).
 */
#ifndef _SYS_SYSPROTO_H_
struct nosys_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
nosys(p, args)
	struct proc *p;
	struct nosys_args *args;
{

	psignal(p, SIGSYS);
	return (EINVAL);
}

/*
 * Send a signal to a SIGIO or SIGURG to a process or process group using
 * stored credentials rather than those of the current process.
 */
void
pgsigio(sigio, sig, checkctty)
	struct sigio *sigio;
	int sig, checkctty;
{

	if (sigio == NULL)
		return;

	if (sigio->sio_pgid > 0) {
		/* Positive sio_pgid names a single process. */
		if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred,
		    sigio->sio_proc))
			psignal(sigio->sio_proc, sig);
	} else if (sigio->sio_pgid < 0) {
		/* Negative sio_pgid names a process group. */
		struct proc *p;

		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist)
			if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, p) &&
			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
				psignal(p, sig);
	}
}

/*
 * filt_sigattach: hook a signal knote onto the current process's klist.
 */
static int
filt_sigattach(struct knote *kn)
{
	struct proc *p = curproc;

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/* XXX lock the proc here while adding to the list? */
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);

	return (0);
}

/*
 * filt_sigdetach: remove a signal knote from its process's klist.
 */
static void
filt_sigdetach(struct knote *kn)
{
	struct proc *p = kn->kn_ptr.p_proc;

	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
}

/*
 * signal knotes are shared with proc knotes, so we apply a mask to
 * the hint in order to differentiate them from process hints.  This
 * could be avoided by using a signal-specific knote list, but probably
 * isn't worth the trouble.
/*
 * filt_signal: kevent filter; count NOTE_SIGNAL hints whose signal
 * number matches this knote's id, and report activity while the count
 * is nonzero.
 */
static int
filt_signal(struct knote *kn, long hint)
{

	if (hint & NOTE_SIGNAL) {
		hint &= ~NOTE_SIGNAL;

		if (kn->kn_id == hint)
			kn->kn_data++;
	}

	return (kn->kn_data != 0);
}
Index: head/sys/kern/kern_subr.c
===================================================================
--- head/sys/kern/kern_subr.c	(revision 72375)
+++ head/sys/kern/kern_subr.c	(revision 72376)
@@ -1,389 +1,389 @@
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD$
 */

/*
 * NOTE(review): the header names of the following #include directives were
 * lost in extraction (the angle-bracketed names were stripped); restore
 * them from the original kern_subr.c before compiling.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static void	uio_yield __P((void));

/*
 * uiomove: move 'n' bytes between the kernel buffer 'cp' and the user or
 * system space described by 'uio', advancing the uio (iov, resid, offset)
 * as it goes.  Returns 0 or an errno from copyin/copyout.
 */
int
uiomove(cp, n, uio)
	register caddr_t cp;
	register int n;
	register struct uio *uio;
{
	register struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
	    ("uiomove proc"));

	/* Mark the process deadlock-tolerant for the duration of the copy. */
	if (curproc) {
		save = curproc->p_flag & P_DEADLKTREAT;
		curproc->p_flag |= P_DEADLKTREAT;
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
		case UIO_USERISPACE:
			/* Yield periodically so long copies don't hog the CPU. */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				break;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy((caddr_t)cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, (caddr_t)cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	if (curproc)
		curproc->p_flag = (curproc->p_flag & ~P_DEADLKTREAT) | save;
	return (error);
}

/*
 * uiomoveco: like uiomove(), but on user-space reads may use
 * vm_uiomove() against 'obj' when ENABLE_VFS_IOOPT is configured,
 * vfs_ioopt is set and buffer, base and offset are all page-aligned.
 */
int
uiomoveco(cp, n, uio, obj)
	caddr_t cp;
	int n;
	struct uio *uio;
	struct vm_object *obj;
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
		case UIO_USERISPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ) {
#ifdef ENABLE_VFS_IOOPT
				if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
					((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
					((uio->uio_offset & PAGE_MASK) == 0) &&
					((((intptr_t) cp) & PAGE_MASK) == 0)) {
					error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
						uio->uio_offset, cnt,
						(vm_offset_t) iov->iov_base, NULL);
				} else
#endif
				{
					error = copyout(cp, iov->iov_base, cnt);
				}
			} else {
				error = copyin(iov->iov_base, cp, cnt);
			}
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy((caddr_t)cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, (caddr_t)cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	return (0);
}

#ifdef ENABLE_VFS_IOOPT

/*
 * uioread: page-flipping read path; moves whole pages from 'obj' into
 * the user address space via vm_uiomove() when alignment permits, and
 * reports the number of bytes moved through '*nread'.  Only active
 * when vfs_ioopt >= 2; otherwise returns 0 having moved nothing.
 */
int
uioread(n, uio, obj, nread)
	int n;
	struct uio *uio;
	struct vm_object *obj;
	int *nread;
{
	int npagesmoved;
	struct iovec *iov;
	u_int cnt, tcnt;
	int error;

	*nread = 0;
	if (vfs_ioopt < 2)
		return 0;

	error = 0;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		if ((uio->uio_segflg == UIO_USERSPACE) &&
			((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
				 ((uio->uio_offset & PAGE_MASK) == 0) ) {

			/* Need at least one whole page to flip. */
			if (cnt < PAGE_SIZE)
				break;

			cnt &= ~PAGE_MASK;

			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
				uio->uio_offset, cnt,
				(vm_offset_t) iov->iov_base, &npagesmoved);

			if (npagesmoved == 0)
				break;

			/* Advance by what was actually moved, not requested. */
			tcnt = npagesmoved * PAGE_SIZE;
			cnt = tcnt;

			if (error)
				break;

			iov->iov_base += cnt;
			iov->iov_len -= cnt;
			uio->uio_resid -= cnt;
			uio->uio_offset += cnt;
			*nread += cnt;
			n -= cnt;
		} else {
			break;
		}
	}
	return error;
}
#endif

/*
 * Give next character to user as result of read.
 */
int
ureadc(c, uio)
	register int c;
	register struct uio *uio;
{
	register struct iovec *iov;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		*iov->iov_base = c;
		break;

	case UIO_USERISPACE:
		if (suibyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;
	case UIO_NOCOPY:
		break;
	}
	iov->iov_base++;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.
 */
void *
hashinit(elements, type, hashmask)
	int elements;
	struct malloc_type *type;
	u_long *hashmask;
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Round down to the largest power of two <= elements. */
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
*/ void * phashinit(elements, type, nentries) int elements; struct malloc_type *type; u_long *nentries; { long hashsize; LIST_HEAD(generic, generic) *hashtbl; int i; if (elements <= 0) panic("phashinit: bad elements"); for (i = 1, hashsize = primes[1]; hashsize <= elements;) { i++; if (i == NPRIMES) break; hashsize = primes[i]; } hashsize = primes[i - 1]; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); for (i = 0; i < hashsize; i++) LIST_INIT(&hashtbl[i]); *nentries = hashsize; return (hashtbl); } static void uio_yield() { struct proc *p; int s; p = curproc; s = splhigh(); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 72375) +++ head/sys/kern/kern_switch.c (revision 72376) @@ -1,256 +1,238 @@ /* * Copyright (c) 1999 Peter Wemm * All rights reserved. + * Copyright (c) 2001 Jake Burkholder + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include -#include #include /* - * We have NQS (32) run queues per scheduling class. For the normal - * class, there are 128 priorities scaled onto these 32 queues. New - * processes are added to the last entry in each queue, and processes - * are selected for running by taking them from the head and maintaining - * a simple FIFO arrangement. - * - * Interrupt, real time and idle priority processes have and explicit - * 0-31 priority which maps directly onto their class queue index. - * When a queue has something in it, the corresponding bit is set in - * the queuebits variable, allowing a single read to determine the - * state of all 32 queues and then a ffs() to find the first busy - * queue. - * - * XXX This needs fixing. First, we only have one idle process, so we - * hardly need 32 queues for it. Secondly, the number of classes - * makes things unwieldy. We should be able to merge them into a - * single 96 or 128 entry queue. + * Global run queue. */ -struct rq itqueues[NQS]; /* interrupt threads */ -struct rq rtqueues[NQS]; /* real time processes */ -struct rq queues[NQS]; /* time sharing processes */ -struct rq idqueues[NQS]; /* idle process */ -u_int32_t itqueuebits; -u_int32_t rtqueuebits; -u_int32_t queuebits; -u_int32_t idqueuebits; +static struct runq runq; +SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) /* - * Initialize the run queues at boot time. 
+ * Wrappers which implement old interface; act on global run queue. */ -static void -rqinit(void *dummy) + +struct proc * +chooseproc(void) { - int i; + return runq_choose(&runq); +} - for (i = 0; i < NQS; i++) { - TAILQ_INIT(&itqueues[i]); - TAILQ_INIT(&rtqueues[i]); - TAILQ_INIT(&queues[i]); - TAILQ_INIT(&idqueues[i]); - } +int +procrunnable(void) +{ + return runq_check(&runq); } -SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) -/* - * setrunqueue() examines a process priority and class and inserts it on - * the tail of it's appropriate run queue (based on class and priority). - * This sets the queue busy bit. - * The process must be runnable. - * This must be called at splhigh(). - */ void +remrunqueue(struct proc *p) +{ + runq_remove(&runq, p); +} + +void setrunqueue(struct proc *p) { - struct rq *q; - u_int8_t pri; + runq_add(&runq, p); +} - mtx_assert(&sched_lock, MA_OWNED); - KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \ - p->p_comm)); +/* + * Clear the status bit of the queue corresponding to priority level pri, + * indicating that it is empty. + */ +static __inline void +runq_clrbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; - /* - * Decide which class we want to run. We now have four - * queues, and this is becoming ugly. We should be able to - * collapse the first three classes into a single contiguous - * queue. XXX FIXME. 
- */ - CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); - if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */ - pri = p->p_rtprio.prio; - q = &itqueues[pri]; - itqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */ - p->p_rtprio.type == RTP_PRIO_FIFO) { - pri = p->p_rtprio.prio; - q = &rtqueues[pri]; - rtqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */ - pri = p->p_priority >> 2; - q = &queues[pri]; - queuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */ - pri = p->p_rtprio.prio; - q = &idqueues[pri]; - idqueuebits |= 1 << pri; - } else { - panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type); - } - p->p_rqindex = pri; /* remember the queue index */ - TAILQ_INSERT_TAIL(q, p, p_procq); + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* - * remrunqueue() removes a given process from the run queue that it is on, - * clearing the queue busy bit if it becomes empty. - * This must be called at splhigh(). + * Find the index of the first non-empty run queue. This is done by + * scanning the status bits, a set bit indicates a non-empty queue. */ +static __inline int +runq_findbit(struct runq *rq) +{ + struct rqbits *rqb; + int pri; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + pri = (RQB_FFS(rqb->rqb_bits[i]) - 1) + + (i << RQB_L2BPW); + CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", + rqb->rqb_bits[i], i, pri); + return (pri); + } + + return (-1); +} + +/* + * Set the status bit of the queue corresponding to priority level pri, + * indicating that it is non-empty. 
+ */ +static __inline void +runq_setbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); +} + +/* + * Add the process to the queue specified by its priority, and set the + * corresponding status bit. + */ void -remrunqueue(struct proc *p) +runq_add(struct runq *rq, struct proc *p) { - struct rq *q; - u_int32_t *which; - u_int8_t pri; + struct rqhead *rqh; + int pri; - CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); mtx_assert(&sched_lock, MA_OWNED); - pri = p->p_rqindex; - if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { - q = &itqueues[pri]; - which = &itqueuebits; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || - p->p_rtprio.type == RTP_PRIO_FIFO) { - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { - q = &idqueues[pri]; - which = &idqueuebits; - } else { - panic("remrunqueue: invalid rtprio type"); - } - TAILQ_REMOVE(q, p, p_procq); - if (TAILQ_EMPTY(q)) { - KASSERT((*which & (1 << pri)) != 0, - ("remrunqueue: remove from empty queue")); - *which &= ~(1 << pri); - } + KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", + p, p->p_comm)); + pri = p->p_pri.pri_level / RQ_PPQ; + p->p_rqindex = pri; + runq_setbit(rq, pri); + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", + p, p->p_pri.pri_level, pri, rqh); + TAILQ_INSERT_TAIL(rqh, p, p_procq); } /* - * procrunnable() returns a boolean true (non-zero) value if there are - * any runnable processes. This is intended to be called from the idle - * loop to avoid the more expensive (and destructive) chooseproc(). 
- * - * MP SAFE. CALLED WITHOUT THE MP LOCK - * - * XXX I doubt this. It's possibly fail-safe, but there's obviously - * the case here where one of the bits words gets loaded, the - * processor gets preempted, and by the time it returns from this - * function, some other processor has picked the runnable process. - * What am I missing? (grog, 23 July 2000). + * Return true if there are runnable processes of any priority on the run + * queue, false otherwise. Has no side effects, does not modify the run + * queue structure. */ -u_int32_t -procrunnable(void) +int +runq_check(struct runq *rq) { - return (itqueuebits || rtqueuebits || queuebits || idqueuebits); + struct rqbits *rqb; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", + rqb->rqb_bits[i], i); + return (1); + } + CTR0(KTR_RUNQ, "runq_check: empty"); + + return (0); } /* - * chooseproc() selects the next process to run. Ideally, cpu_switch() - * would have determined that there is a process available before calling - * this, but it is not a requirement. The selected process is removed - * from it's queue, and the queue busy bit is cleared if it becomes empty. - * This must be called at splhigh(). - * - * For SMP, trivial affinity is implemented by locating the first process - * on the queue that has a matching lastcpu id. Since normal priorities - * are mapped four priority levels per queue, this may allow the cpu to - * choose a slightly lower priority process in order to preserve the cpu - * caches. + * Find and remove the highest priority process from the run queue. + * If there are no runnable processes, the per-cpu idle process is + * returned. Will not return NULL under any circumstances. 
*/ struct proc * -chooseproc(void) +runq_choose(struct runq *rq) { + struct rqhead *rqh; struct proc *p; - struct rq *q; - u_int32_t *which; - u_int32_t pri; -#ifdef SMP - u_char id; -#endif + int pri; mtx_assert(&sched_lock, MA_OWNED); - if (itqueuebits) { - pri = ffs(itqueuebits) - 1; - q = &itqueues[pri]; - which = &itqueuebits; - } else if (rtqueuebits) { - pri = ffs(rtqueuebits) - 1; - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (queuebits) { - pri = ffs(queuebits) - 1; - q = &queues[pri]; - which = &queuebits; - } else if (idqueuebits) { - pri = ffs(idqueuebits) - 1; - q = &idqueues[pri]; - which = &idqueuebits; - } else { - CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %lx", - (long)sched_lock.mtx_lock); - return PCPU_GET(idleproc); - } - p = TAILQ_FIRST(q); -#ifdef SMP - /* wander down the current run queue for this pri level for a match */ - id = PCPU_GET(cpuid); - while (p->p_lastcpu != id) { - p = TAILQ_NEXT(p, p_procq); - if (p == NULL) { - p = TAILQ_FIRST(q); - break; + if ((pri = runq_findbit(rq)) != -1) { + rqh = &rq->rq_queues[pri]; + p = TAILQ_FIRST(rqh); + CTR3(KTR_RUNQ, "runq_choose: pri=%d p=%p rqh=%p", pri, p, rqh); + TAILQ_REMOVE(rqh, p, p_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_choose: empty"); + runq_clrbit(rq, pri); } + return (p); } -#endif - CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); - KASSERT(p, ("chooseproc: no proc on busy queue")); - TAILQ_REMOVE(q, p, p_procq); - if (TAILQ_EMPTY(q)) - *which &= ~(1 << pri); - return p; + CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); + + return (PCPU_GET(idleproc)); +} + +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + +/* + * Remove the process from the queue specified by its priority, and clear the + * corresponding status bit if the queue becomes empty. 
+ */ +void +runq_remove(struct runq *rq, struct proc *p) +{ + struct rqhead *rqh; + int pri; + + mtx_assert(&sched_lock, MA_OWNED); + pri = p->p_rqindex; + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", + p, p->p_pri.pri_level, pri, rqh); + KASSERT(p != NULL, ("runq_remove: no proc on busy queue")); + TAILQ_REMOVE(rqh, p, p_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_remove: empty"); + runq_clrbit(rq, pri); + } } Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 72375) +++ head/sys/kern/kern_synch.c (revision 72376) @@ -1,1110 +1,1068 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include static void sched_setup __P((void *dummy)); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) -u_char curpriority; int hogticks; int lbolt; int sched_quantum; /* Roundrobin scheduling quantum in ticks. 
*/ static struct callout schedcpu_callout; static struct callout roundrobin_callout; -static int curpriority_cmp __P((struct proc *p)); static void endtsleep __P((void *)); static void roundrobin __P((void *arg)); static void schedcpu __P((void *arg)); static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", ""); -/*- - * Compare priorities. Return: - * <0: priority of p < current priority - * 0: priority of p == current priority - * >0: priority of p > current priority - * The priorities are the normal priorities or the normal realtime priorities - * if p is on the same scheduler as curproc. Otherwise the process on the - * more realtimeish scheduler has lowest priority. As usual, a higher - * priority really means a lower priority. - */ -static int -curpriority_cmp(p) - struct proc *p; -{ - int c_class, p_class; - - c_class = RTP_PRIO_BASE(curproc->p_rtprio.type); - p_class = RTP_PRIO_BASE(p->p_rtprio.type); - if (p_class != c_class) - return (p_class - c_class); - if (p_class == RTP_PRIO_NORMAL) - return (((int)p->p_priority - (int)curpriority) / PPQ); - return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio); -} - /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ void -maybe_resched(chk) - struct proc *chk; +maybe_resched(p) + struct proc *p; { - struct proc *p = curproc; /* XXX */ - /* - * XXX idle scheduler still broken because proccess stays on idle - * scheduler during waits (such as when getting FS locks). 
If a - * standard process becomes runaway cpu-bound, the system can lockup - * due to idle-scheduler processes in wakeup never getting any cpu. - */ - if (p == PCPU_GET(idleproc)) { -#if 0 + if (p->p_pri.pri_level < curproc->p_pri.pri_level) need_resched(); -#endif - } else if (chk == p) { - /* We may need to yield if our priority has been raised. */ - if (curpriority_cmp(chk) > 0) - need_resched(); - } else if (curpriority_cmp(chk) < 0) - need_resched(); } int roundrobin_interval(void) { return (sched_quantum); } /* * Force switch among equal priority processes every 100ms. */ /* ARGSUSED */ static void roundrobin(arg) void *arg; { mtx_lock_spin(&sched_lock); need_resched(); mtx_unlock_spin(&sched_lock); #ifdef SMP forward_roundrobin(); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (p_estcpu) usage in 5 * loadav time * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
* That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * p_estcpu *= decay; * will compute * p_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(arg) void *arg; { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); register struct proc *p; register int realstathz, s; realstathz = stathz ? stathz : hz; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { /* * Increment time in/out of memory and sleep time * (if sleeping). We ignore overflow; with 16-bit int's * (remember them?) overflow takes 45 days. 
if (p->p_stat == SWAIT) continue; */ mtx_lock_spin(&sched_lock); p->p_swtime++; if (p->p_stat == SSLEEP || p->p_stat == SSTOP) p->p_slptime++; p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; /* * If the process has slept the entire second, * stop recalculating its priority until it wakes up. */ if (p->p_slptime > 1) { mtx_unlock_spin(&sched_lock); continue; } /* * prevent state changes and protect run queue */ s = splhigh(); /* * p_pctcpu is only for ps. */ #if (FSHIFT >= CCPU_SHIFT) p->p_pctcpu += (realstathz == 100)? ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): 100 * (((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else p->p_pctcpu += ((FSCALE - ccpu) * (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif p->p_cpticks = 0; p->p_estcpu = decay_cpu(loadfac, p->p_estcpu); resetpriority(p); - if (p->p_priority >= PUSER) { + if (p->p_pri.pri_level >= PUSER) { if ((p != curproc) && #ifdef SMP p->p_oncpu == 0xff && /* idle */ #endif p->p_stat == SRUN && (p->p_sflag & PS_INMEM) && - (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) { + (p->p_pri.pri_level / RQ_PPQ) != + (p->p_pri.pri_user / RQ_PPQ)) { remrunqueue(p); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; setrunqueue(p); } else - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; } mtx_unlock_spin(&sched_lock); splx(s); } ALLPROC_LOCK(AP_RELEASE); vmmeter(); wakeup((caddr_t)&lbolt); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max p_estcpu of 255, sleeping for at * least six times the loadfactor will decay p_estcpu to zero. 
*/ void updatepri(p) register struct proc *p; { register unsigned int newcpu = p->p_estcpu; register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); if (p->p_slptime > 5 * loadfac) p->p_estcpu = 0; else { p->p_slptime--; /* the first time was done in schedcpu */ while (newcpu && --p->p_slptime) newcpu = decay_cpu(loadfac, newcpu); p->p_estcpu = newcpu; } resetpriority(p); } /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. */ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; sched_quantum = hz/10; hogticks = 2 * sched_quantum; for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. 
*/ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct proc *p = curproc; int s, sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif WITNESS_SLEEP(0, mtx); mtx_lock_spin(&sched_lock); s = splhigh(); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, * just give interrupts a chance, then just return; * don't run any other procs or panic below, * in case this is the idle process and already asleep. */ if (mtx != NULL && priority & PDROP) mtx_unlock_flags(mtx, MTX_NOSWITCH); mtx_unlock_spin(&sched_lock); splx(s); return (0); } DROP_GIANT_NOSWITCH(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(mtx, mtx); mtx_unlock_flags(mtx, MTX_NOSWITCH); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep")); /* * Process may be sitting on a slpque if asleep() was called, remove * it before re-adding. */ if (p->p_wchan != NULL) unsleep(p); p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; - p->p_priority = priority & PRIMASK; + p->p_pri.pri_level = priority & PRIMASK; CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq); if (timo) callout_reset(&p->p_slpcallout, timo, endtsleep, p); /* * We put ourselves on the sleep queue and start our timeout * before calling CURSIG, as we could stop there, and a wakeup * or a SIGCONT (or both) could occur while we were stopped. * A SIGCONT would cause us to be marked as SSLEEP * without resuming us, thus we must be ready for sleep * when CURSIG is called. If the wakeup happens while we're * stopped, p->p_wchan will be 0 upon return from CURSIG. 
*/ if (catch) { CTR4(KTR_PROC, "msleep caught: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); if ((sig = CURSIG(p))) { mtx_lock_spin(&sched_lock); if (p->p_wchan) unsleep(p); p->p_stat = SRUN; goto resume; } mtx_lock_spin(&sched_lock); if (p->p_wchan == NULL) { catch = 0; goto resume; } } else sig = 0; p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); CTR4(KTR_PROC, "msleep resume: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); resume: - curpriority = p->p_usrpri; splx(s); p->p_sflag &= ~PS_SINTR; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; if (sig == 0) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); goto out; } } else if (timo) callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); if (catch && (sig != 0 || (sig = CURSIG(p)))) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; goto out; } out: #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(mtx, mtx); } return (rval); } /* * asleep() - async sleep call. Place process on wait queue and return * immediately without blocking. The process stays runnable until mawait() * is called. If ident is NULL, remove process from wait queue if it is still * on one. * * Only the most recent sleep condition is effective when making successive * calls to asleep() or when calling msleep(). * * The timeout, if any, is not initiated until mawait() is called. The sleep * priority, signal, and timeout is specified in the asleep() call but may be * overriden in the mawait() call. 
* * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> */ int asleep(void *ident, int priority, const char *wmesg, int timo) { struct proc *p = curproc; int s; /* * obtain sched_lock while manipulating sleep structures and slpque. * * Remove preexisting wait condition (if any) and place process * on appropriate slpque, but do not put process to sleep. */ s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan != NULL) unsleep(p); if (ident) { p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; p->p_asleep.as_priority = priority; p->p_asleep.as_timo = timo; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq); } mtx_unlock_spin(&sched_lock); splx(s); return(0); } /* * mawait() - wait for async condition to occur. The process blocks until * wakeup() is called on the most recent asleep() address. If wakeup is called * prior to mawait(), mawait() winds up being a NOP. * * If mawait() is called more than once (without an intervening asleep() call), * mawait() is still effectively a NOP but it calls mi_switch() to give other * processes some cpu before returning. The process is left runnable. * * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> */ int mawait(struct mtx *mtx, int priority, int timo) { struct proc *p = curproc; int rval = 0; int s; WITNESS_SAVE_DECL(mtx); WITNESS_SLEEP(0, mtx); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(mtx, mtx); mtx_unlock_flags(mtx, MTX_NOSWITCH); if (priority & PDROP) mtx = NULL; } s = splhigh(); if (p->p_wchan != NULL) { int sig; int catch; /* * The call to mawait() can override defaults specified in * the original asleep().
*/ if (priority < 0) priority = p->p_asleep.as_priority; if (timo < 0) timo = p->p_asleep.as_timo; /* * Install timeout */ if (timo) callout_reset(&p->p_slpcallout, timo, endtsleep, p); sig = 0; catch = priority & PCATCH; if (catch) { p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); if ((sig = CURSIG(p))) { mtx_lock_spin(&sched_lock); if (p->p_wchan) unsleep(p); p->p_stat = SRUN; goto resume; } mtx_lock_spin(&sched_lock); if (p->p_wchan == NULL) { catch = 0; goto resume; } } p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); resume: - curpriority = p->p_usrpri; splx(s); p->p_sflag &= ~PS_SINTR; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; if (sig == 0) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); goto out; } } else if (timo) callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); if (catch && (sig != 0 || (sig = CURSIG(p)))) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; goto out; } #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif } else { /* * If as_priority is 0, mawait() has been called without an * intervening asleep(). We are still effectively a NOP, * but we call mi_switch() for safety. */ if (p->p_asleep.as_priority == 0) { p->p_stats->p_ru.ru_nvcsw++; mi_switch(); } mtx_unlock_spin(&sched_lock); splx(s); } /* * clear p_asleep.as_priority as an indication that mawait() has been * called. If mawait() is called again without an intervening asleep(), * mawait() is still effectively a NOP but the above mi_switch() code * is triggered as a safety. 
*/ p->p_asleep.as_priority = 0; out: PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(mtx, mtx); } return (rval); } /* * Implement timeout for msleep or asleep()/mawait() * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. */ static void endtsleep(arg) void *arg; { register struct proc *p; int s; p = (struct proc *)arg; CTR4(KTR_PROC, "endtsleep: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan) { if (p->p_stat == SSLEEP) setrunnable(p); else unsleep(p); p->p_sflag |= PS_TIMEOUT; } mtx_unlock_spin(&sched_lock); splx(s); } /* * Remove a process from its wait queue */ void unsleep(p) register struct proc *p; { int s; s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan) { TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq); p->p_wchan = NULL; } mtx_unlock_spin(&sched_lock); splx(s); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct proc *p; int s; s = splhigh(); mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: TAILQ_FOREACH(p, qp, p_slpq) { if (p->p_wchan == ident) { TAILQ_REMOVE(qp, p, p_slpq); p->p_wchan = NULL; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR4(KTR_PROC, "wakeup: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); } else { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } /* END INLINE EXPANSION */ goto restart; } } } mtx_unlock_spin(&sched_lock); splx(s); } /* * Make a process sleeping on the specified identifier runnable. 
* May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct slpquehead *qp; register struct proc *p; int s; s = splhigh(); mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; TAILQ_FOREACH(p, qp, p_slpq) { if (p->p_wchan == ident) { TAILQ_REMOVE(qp, p, p_slpq); p->p_wchan = NULL; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR4(KTR_PROC, "wakeup1: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); break; } else { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } /* END INLINE EXPANSION */ } } } mtx_unlock_spin(&sched_lock); splx(s); } /* * The machine independent parts of mi_switch(). * Must be called at splstatclock() or higher. */ void mi_switch() { struct timeval new_switchtime; register struct proc *p = curproc; /* XXX */ #if 0 register struct rlimit *rlim; #endif int x; /* * XXX this spl is almost unnecessary. It is partly to allow for * sloppy callers that don't do it (issignal() via CURSIG() is the * main offender). It is partly to work around a bug in the i386 * cpu_switch() (the ipl is not preserved). We ran for years * without it. I think there was only a interrupt latency problem. * The main caller, msleep(), does an splx() a couple of instructions * after calling here. The buggy caller, issignal(), usually calls * here at spl0() and sometimes returns at splhigh(). The process * then runs for a little too long at splhigh(). The ipl gets fixed * when the process returns to user mode (or earlier). * * It would probably be better to always call here at spl0(). Callers * are prepared to give up control to another process, so they must * be prepared to be interrupted. The clock stuff here may not * actually need splstatclock(). 
*/ x = splstatclock(); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ microuptime(&new_switchtime); if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) { #if 0 /* XXX: This doesn't play well with sched_lock right now. */ printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec), new_switchtime.tv_sec, new_switchtime.tv_usec); #endif new_switchtime = PCPU_GET(switchtime); } else { p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) + (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) * (int64_t)1000000; } #if 0 /* * Check if the process exceeds its cpu resource allocation. * If over max, kill it. * * XXX drop sched_lock, pickup Giant */ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { rlim = &p->p_rlimit[RLIMIT_CPU]; if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); mtx_lock_spin(&sched_lock); } else { mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); mtx_lock_spin(&sched_lock); if (rlim->rlim_cur < rlim->rlim_max) { /* XXX: we should make a private copy */ rlim->rlim_cur += 5; } } } #endif /* * Pick a new current process and record its start time. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); cpu_switch(); CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (PCPU_GET(switchtime.tv_sec) == 0) microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); splx(x); } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. 
*/ void setrunnable(p) register struct proc *p; { register int s; s = splhigh(); mtx_lock_spin(&sched_lock); switch (p->p_stat) { case 0: case SRUN: case SZOMB: case SWAIT: default: panic("setrunnable"); case SSTOP: case SSLEEP: /* e.g. when sending signals */ if (p->p_sflag & PS_CVWAITQ) cv_waitq_remove(p); else unsleep(p); break; case SIDL: break; } p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) setrunqueue(p); splx(s); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; if ((p->p_sflag & PS_INMEM) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } else maybe_resched(p); mtx_unlock_spin(&sched_lock); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ void resetpriority(p) register struct proc *p; { register unsigned int newpriority; mtx_lock_spin(&sched_lock); - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + if (p->p_pri.pri_class == PRI_TIMESHARE) { newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (p->p_nice - PRIO_MIN); - newpriority = min(newpriority, MAXPRI); - p->p_usrpri = newpriority; + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + p->p_pri.pri_user = newpriority; } maybe_resched(p); mtx_unlock_spin(&sched_lock); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&schedcpu_callout, 1); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (p_estcpu) is increased here. resetpriority() will * compute a different priority each time p_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). 
The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void schedclock(p) struct proc *p; { p->p_cpticks++; p->p_estcpu = ESTCPULIM(p->p_estcpu + 1); if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(p); - if (p->p_priority >= PUSER) - p->p_priority = p->p_usrpri; + if (p->p_pri.pri_level >= PUSER) + p->p_pri.pri_level = p->p_pri.pri_user; } } /* * General purpose yield system call */ int yield(struct proc *p, struct yield_args *uap) { int s; p->p_retval[0] = 0; s = splhigh(); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); - p->p_priority = MAXPRI; + p->p_pri.pri_level = PRI_MAX_TIMESHARE; setrunqueue(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); return (0); } Index: head/sys/kern/ksched.c =================================================================== --- head/sys/kern/ksched.c (revision 72375) +++ head/sys/kern/ksched.c (revision 72376) @@ -1,264 +1,269 @@ /* * Copyright (c) 1996, 1997 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* ksched: Soft real time scheduling based on "rtprio". */ #include #include #include #include #include /* For need_resched */ #include /* For need_resched */ #include /* ksched: Real-time extension to support POSIX priority scheduling. */ struct ksched { struct timespec rr_interval; }; int ksched_attach(struct ksched **p) { struct ksched *ksched= p31b_malloc(sizeof(*ksched)); ksched->rr_interval.tv_sec = 0; ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); *p = ksched; return 0; } int ksched_detach(struct ksched *p) { p31b_free(p); return 0; } /* * XXX About priorities * * POSIX 1003.1b requires that numerically higher priorities be of * higher priority. It also permits sched_setparam to be * implementation defined for SCHED_OTHER. 
I don't like * the notion of inverted priorities for normal processes when * you can use "setpriority" for that. * * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. */ /* Macros to convert between the unix (lower numerically is higher priority) * and POSIX 1003.1b (higher numerically is higher priority) */ #define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) #define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) /* These improve readability a bit for me: */ #define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) #define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) static __inline int getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { + struct rtprio rtp; int e = 0; - switch (p->p_rtprio.type) + pri_to_rtp(&p->p_pri, &rtp); + switch (rtp.type) { case RTP_PRIO_FIFO: *ret = SCHED_FIFO; break; case RTP_PRIO_REALTIME: *ret = SCHED_RR; break; default: *ret = SCHED_OTHER; break; } return e; } int ksched_setparam(register_t *ret, struct ksched *ksched, struct proc *p, const struct sched_param *param) { register_t policy; int e; e = getscheduler(&policy, ksched, p); if (e == 0) { if (policy == SCHED_OTHER) e = EINVAL; else e = ksched_setscheduler(ret, ksched, p, policy, param); } return e; } int ksched_getparam(register_t *ret, struct ksched *ksched, struct proc *p, struct sched_param *param) { - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) - param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + struct rtprio rtp; + pri_to_rtp(&p->p_pri, &rtp); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + return 0; } /* * XXX The priority and scheduler modifications should * be moved into published interfaces in kern/kern_sync. * * The permissions to modify process p were checked in "p31b_proc()".
* */ int ksched_setscheduler(register_t *ret, struct ksched *ksched, struct proc *p, int policy, const struct sched_param *param) { int e = 0; struct rtprio rtp; switch(policy) { case SCHED_RR: case SCHED_FIFO: if (param->sched_priority >= P1B_PRIO_MIN && param->sched_priority <= P1B_PRIO_MAX) { rtp.prio = p4prio_to_rtpprio(param->sched_priority); rtp.type = (policy == SCHED_FIFO) ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); need_resched(); } else e = EPERM; break; case SCHED_OTHER: { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); /* XXX Simply revert to whatever we had for last * normal scheduler priorities. * This puts a requirement * on the scheduling code: You must leave the * scheduling info alone. */ need_resched(); } break; } return e; } int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { return getscheduler(ret, ksched, p); } /* ksched_yield: Yield the CPU. 
*/ int ksched_yield(register_t *ret, struct ksched *ksched) { need_resched(); return 0; } int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = RTP_PRIO_MAX; break; case SCHED_OTHER: *ret = PRIO_MAX; break; default: e = EINVAL; } return e; } int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = P1B_PRIO_MIN; break; case SCHED_OTHER: *ret = PRIO_MIN; break; default: e = EINVAL; } return e; } int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, struct proc *p, struct timespec *timespec) { *timespec = ksched->rr_interval; return 0; } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 72375) +++ head/sys/kern/subr_trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler 
__P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) 
* If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. 
*/ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. */ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer	= 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at <kstack> unless * the machine was idle when the double fault occurred. The downside * of this is that "trace <ebp>" in ddb won't work. 
*/ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); if (!grow_stack (p, va)) { PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); return (1); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } /* * syscall - MP aware system call request C handler * * A system call is essentially treated as a trap except that the * MP lock is not held on entry or return. We are responsible for * obtaining the MP lock if necessary and for handling ASTs * (e.g. a task switch) prior to return. * * In general, only simple access and manipulation of curproc and * the current stack is allowed without having to hold MP lock. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/kern/subr_turnstile.c =================================================================== --- head/sys/kern/subr_turnstile.c (revision 72375) +++ head/sys/kern/subr_turnstile.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. */ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. 
static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shutting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. */ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. 
*/ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. */ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. 
Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. */ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
*/ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. */ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. 
*/ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. */ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. 
*/ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); 
if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int 
retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. */ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 72375) +++ head/sys/kern/subr_witness.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. 
*/ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. */ static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shuting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. 
*/ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. */ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. 
*/ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. 
*/ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. */ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. 
*/ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. 
*/ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). 
*/ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p 
resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. 
*/ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/posix4/ksched.c =================================================================== --- head/sys/posix4/ksched.c (revision 72375) +++ head/sys/posix4/ksched.c (revision 72376) @@ -1,264 +1,269 @@ /* * Copyright (c) 1996, 1997 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* ksched: Soft real time scheduling based on "rtprio". */ #include #include #include #include #include /* For need_resched */ #include /* For need_resched */ #include /* ksched: Real-time extension to support POSIX priority scheduling. */ struct ksched { struct timespec rr_interval; }; int ksched_attach(struct ksched **p) { struct ksched *ksched= p31b_malloc(sizeof(*ksched)); ksched->rr_interval.tv_sec = 0; ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); *p = ksched; return 0; } int ksched_detach(struct ksched *p) { p31b_free(p); return 0; } /* * XXX About priorities * * POSIX 1003.1b requires that numerically higher priorities be of * higher priority. It also permits sched_setparam to be * implementation defined for SCHED_OTHER. I don't like * the notion of inverted priorites for normal processes when * you can use "setpriority" for that. * * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. 
*/ /* Macros to convert between the unix (lower numerically is higher priority) * and POSIX 1003.1b (higher numerically is higher priority) */ #define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) #define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) /* These improve readability a bit for me: */ #define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) #define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) static __inline int getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { + struct rtprio rtp; int e = 0; - switch (p->p_rtprio.type) + pri_to_rtp(&p->p_pri, &rtp); + switch (rtp.type) { case RTP_PRIO_FIFO: *ret = SCHED_FIFO; break; case RTP_PRIO_REALTIME: *ret = SCHED_RR; break; default: *ret = SCHED_OTHER; break; } return e; } int ksched_setparam(register_t *ret, struct ksched *ksched, struct proc *p, const struct sched_param *param) { register_t policy; int e; e = getscheduler(&policy, ksched, p); if (e == 0) { if (policy == SCHED_OTHER) e = EINVAL; else e = ksched_setscheduler(ret, ksched, p, policy, param); } return e; } int ksched_getparam(register_t *ret, struct ksched *ksched, struct proc *p, struct sched_param *param) { - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) - param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + struct rtprio rtp; + pri_to_rtp(&p->p_pri, &rtp); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + return 0; } /* * XXX The priority and scheduler modifications should * be moved into published interfaces in kern/kern_sync. * * The permissions to modify process p were checked in "p31b_proc()". * */ int ksched_setscheduler(register_t *ret, struct ksched *ksched, struct proc *p, int policy, const struct sched_param *param) { int e = 0; struct rtprio rtp; switch(policy) { case SCHED_RR: case SCHED_FIFO: if (param->sched_priority >= P1B_PRIO_MIN && param->sched_priority <= P1B_PRIO_MAX) { rtp.prio = p4prio_to_rtpprio(param->sched_priority); rtp.type = (policy == SCHED_FIFO) ? 
RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); need_resched(); } else e = EPERM; break; case SCHED_OTHER: { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); /* XXX Simply revert to whatever we had for last * normal scheduler priorities. * This puts a requirement * on the scheduling code: You must leave the * scheduling info alone. */ need_resched(); } break; } return e; } int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { return getscheduler(ret, ksched, p); } /* ksched_yield: Yield the CPU. */ int ksched_yield(register_t *ret, struct ksched *ksched) { need_resched(); return 0; } int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = RTP_PRIO_MAX; break; case SCHED_OTHER: *ret = PRIO_MAX; break; default: e = EINVAL; } return e; } int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = P1B_PRIO_MIN; break; case SCHED_OTHER: *ret = PRIO_MIN; break; default: e = EINVAL; } return e; } int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, struct proc *p, struct timespec *timespec) { *timespec = ksched->rr_interval; return 0; } Index: head/sys/sys/ktr.h =================================================================== --- head/sys/sys/ktr.h (revision 72375) +++ head/sys/sys/ktr.h (revision 72376) @@ -1,212 +1,213 @@ /*- * Copyright (c) 1996 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: ktr.h,v 1.10.2.7 2000/03/16 21:44:42 cp Exp $ * $FreeBSD$ */ /* * Wraparound kernel trace buffer support. 
*/ #ifndef _SYS_KTR_H_ #define _SYS_KTR_H_ /* Requires sys/types.h, sys/time.h, machine/atomic.h, and machine/cpufunc.h */ #include #include /* * Trace classes */ #define KTR_GEN 0x00000001 /* General (TR) */ #define KTR_NET 0x00000002 /* Network */ #define KTR_DEV 0x00000004 /* Device driver */ #define KTR_LOCK 0x00000008 /* MP locking */ #define KTR_SMP 0x00000010 /* MP general */ #define KTR_FS 0x00000020 /* Filesystem */ #define KTR_PMAP 0x00000040 /* Pmap tracing */ #define KTR_MALLOC 0x00000080 /* Malloc tracing */ #define KTR_TRAP 0x00000100 /* Trap processing */ #define KTR_INTR 0x00000200 /* Interrupt tracing */ #define KTR_SIG 0x00000400 /* Signal processing */ #define KTR_CLK 0x00000800 /* hardclock verbose */ #define KTR_PROC 0x00001000 /* Process scheduling */ #define KTR_SYSC 0x00002000 /* System call */ #define KTR_INIT 0x00004000 /* System initialization */ #define KTR_KGDB 0x00008000 /* Trace kgdb internals */ #define KTR_IO 0x00010000 /* Upper I/O */ #define KTR_LOCKMGR 0x00020000 #define KTR_NFS 0x00040000 /* The obvious */ #define KTR_VOP 0x00080000 /* The obvious */ #define KTR_VM 0x00100000 /* The virtual memory system */ #define KTR_IDLELOOP 0x00200000 /* checks done in the idle process */ +#define KTR_RUNQ 0x00400000 /* Run queue */ /* * Trace classes which can be assigned to particular use at compile time * These must remain in high 22 as some assembly code counts on it */ #define KTR_CT1 0x010000000 #define KTR_CT2 0x020000000 #define KTR_CT3 0x040000000 #define KTR_CT4 0x080000000 #define KTR_CT5 0x100000000 #define KTR_CT6 0x200000000 #define KTR_CT7 0x400000000 #define KTR_CT8 0x800000000 /* Trace classes to compile in */ #ifndef KTR_COMPILE #define KTR_COMPILE (KTR_GEN) #endif #ifndef LOCORE #include struct ktr_entry { struct timespec ktr_tv; #ifdef KTR_EXTEND #ifndef KTRDESCSIZE #define KTRDESCSIZE 80 #endif #ifndef KTRFILENAMESIZE #define KTRFILENAMESIZE 32 #endif char ktr_desc [KTRDESCSIZE]; char ktr_filename [KTRFILENAMESIZE]; int 
ktr_line; int ktr_cpu; #else char *ktr_desc; u_long ktr_parm1; u_long ktr_parm2; u_long ktr_parm3; u_long ktr_parm4; u_long ktr_parm5; #endif }; /* These variables are used by gdb to analyse the output */ extern int ktr_extend; extern int ktr_cpumask; extern int ktr_mask; extern int ktr_entries; extern int ktr_verbose; extern volatile int ktr_idx; extern struct ktr_entry ktr_buf[]; #endif /* !LOCORE */ #ifdef KTR #ifndef KTR_ENTRIES #define KTR_ENTRIES 1024 #endif #ifdef KTR_EXTEND void ktr_tracepoint(u_int mask, char *filename, u_int line, char *format, ...); #else void ktr_tracepoint(u_int mask, char *format, u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5); #endif #ifdef KTR_EXTEND #define CTR(m, format, args...) do { \ if (KTR_COMPILE & (m)) \ ktr_tracepoint((m), __FILE__, __LINE__, format , ##args); \ } while(0) #define CTR0(m, format) CTR(m, format) #define CTR1(m, format, p1) CTR(m, format, p1) #define CTR2(m, format, p1, p2) CTR(m, format, p1, p2) #define CTR3(m, format, p1, p2, p3) CTR(m, format, p1, p2, p3) #define CTR4(m, format, p1, p2, p3, p4) CTR(m, format, p1, p2, p3, p4) #define CTR5(m, format, p1, p2, p3, p4, p5) \ CTR(m, format, p1, p2, p3, p4, p5) #else /* not extended */ #define CTR5(m, format, p1, p2, p3, p4, p5) do { \ if (KTR_COMPILE & (m)) \ ktr_tracepoint((m), format, (u_long)p1, (u_long)p2, \ (u_long)p3, (u_long)p4, (u_long)p5); \ } while(0) #define CTR0(m, format) CTR5(m, format, 0, 0, 0, 0, 0) #define CTR1(m, format, p1) CTR5(m, format, p1, 0, 0, 0, 0) #define CTR2(m, format, p1, p2) CTR5(m, format, p1, p2, 0, 0, 0) #define CTR3(m, format, p1, p2, p3) CTR5(m, format, p1, p2, p3, 0, 0) #define CTR4(m, format, p1, p2, p3, p4) CTR5(m, format, p1, p2, p3, p4, 0) #endif /* KTR_EXTEND */ #else /* KTR */ #undef KTR_COMPILE #define KTR_COMPILE 0 #define CTR0(m, d) #define CTR1(m, d, p1) #define CTR2(m, d, p1, p2) #define CTR3(m, d, p1, p2, p3) #define CTR4(m, d, p1, p2, p3, p4) #define CTR5(m, d, p1, p2, p3, p4, p5) /* XXX 
vvvvvvvv ??? */ #define SEG_ATR(d,s) #define SEG_ATR_DESC(d,s) #define ATR(d) #define CATR(f,d,n) #define CATRD(f,d,n) #endif /* KTR */ #define TR0(d) CTR0(KTR_GEN, d) #define TR1(d, p1) CTR1(KTR_GEN, d, p1) #define TR2(d, p1, p2) CTR2(KTR_GEN, d, p1, p2) #define TR3(d, p1, p2, p3) CTR3(KTR_GEN, d, p1, p2, p3) #define TR4(d, p1, p2, p3, p4) CTR4(KTR_GEN, d, p1, p2, p3, p4) #define TR5(d, p1, p2, p3, p4, p5) CTR5(KTR_GEN, d, p1, p2, p3, p4, p5) /* * Trace initialization events, similar to CTR with KTR_INIT, but * completely ifdef'ed out if KTR_INIT isn't in KTR_COMPILE (to * save string space, the compiler doesn't optimize out strings * for the conditional ones above). */ #if (KTR_COMPILE & KTR_INIT) != 0 #define ITR0(d) CTR0(KTR_INIT, d) #define ITR1(d, p1) CTR1(KTR_INIT, d, p1) #define ITR2(d, p1, p2) CTR2(KTR_INIT, d, p1, p2) #define ITR3(d, p1, p2, p3) CTR3(KTR_INIT, d, p1, p2, p3) #define ITR4(d, p1, p2, p3, p4) CTR4(KTR_INIT, d, p1, p2, p3, p4) #define ITR5(d, p1, p2, p3, p4, p5) CTR5(KTR_INIT, d, p1, p2, p3, p4, p5) #else #define ITR0(d) #define ITR1(d, p1) #define ITR2(d, p1, p2) #define ITR3(d, p1, p2, p3) #define ITR4(d, p1, p2, p3, p4) #define ITR5(d, p1, p2, p3, p4, p5) #endif #endif /* !_SYS_KTR_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 72375) +++ head/sys/sys/param.h (revision 72376) @@ -1,252 +1,235 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 #undef __FreeBSD_version #define __FreeBSD_version 500016 /* Master, propagated to newvers */ #ifndef NULL #define NULL 0 #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). 
* Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) * MAXLOGNAME should be == UT_NAMESIZE+1 (see ) */ #include #define MAXCOMLEN 16 /* max command name remembered */ #define MAXINTERP 32 /* max interpreter file name length */ #define MAXLOGNAME 17 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS NGROUPS_MAX /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 15 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #include +#include #define FALSE 0 #define TRUE 1 #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif - -/* - * Priorities. Note that with 32 run queues, differences less than 4 are - * insignificant. - */ -#define PSWP 0 -#define PVM 4 -#define PINOD 8 -#define PRIBIO 16 -#define PVFS 20 -#define PZERO 22 /* No longer magic, shouldn't be here. XXX */ -#define PSOCK 24 -#define PWAIT 32 -#define PCONFIG 32 -#define PLOCK 36 -#define PPAUSE 40 -#define PUSER 48 -#define MAXPRI 127 /* Priorities range from 0 through MAXPRI. 
*/ #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #ifdef _KERNEL #define NODEV (dev_t)(-1) /* non-existent device */ #define NOUDEV (udev_t)(-1) /* non-existent device */ #define NOMAJ 256 /* non-existent device */ #else #define NODEV (dev_t)(-1) /* non-existent device */ #endif #define CBLOCK 128 /* Clist block size, must be a power of 2. */ #define CBQSIZE (CBLOCK/NBBY) /* Quote bytes/cblock - can do better. */ /* Data chars/clist. */ #define CBSIZE (CBLOCK - sizeof(struct cblock *) - CBQSIZE) #define CROUND (CBLOCK - 1) /* Clist rounding. */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. 
*/ #define MAXBSIZE 65536 /* must be power of 2 */ #define BKVASIZE 16384 /* must be power of 2 */ #define BKVAMASK (BKVASIZE-1) #define MAXFRAG 8 /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) ((a)[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) ((a)[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) ((a)[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) (((a)[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define rounddown(x, y) (((x)/(y))*(y)) #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #ifndef _KERNEL #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #endif /* * Constants for setting the parameters of the kernel memory allocator. * * 2 ** MINBUCKET is the smallest unit of memory that will be * allocated. It must be at least large enough to hold a pointer. * * Units of memory less or equal to MAXALLOCSAVE will permanently * allocate physical memory; requests for these size pieces of * memory are quite fast. Allocations greater than MAXALLOCSAVE must * always allocate and free physical memory; requests for these * size allocations should be done infrequently as they will be slow. 
* * Constraints: PAGE_SIZE <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14), and * MAXALLOCSIZE must be a power of two. */ #if defined(__alpha__) || defined(__ia64__) #define MINBUCKET 5 /* 5 => min allocation of 32 bytes */ #else #define MINBUCKET 4 /* 4 => min allocation of 16 bytes */ #endif #define MAXALLOCSAVE (2 * PAGE_SIZE) /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Make this available for most of the kernel. There were too many * things that included sys/systm.h just for panic(). */ #ifdef _KERNEL void panic __P((const char *, ...)) __dead2 __printflike(1, 2); #endif #endif /* _SYS_PARAM_H_ */ Index: head/sys/sys/priority.h =================================================================== --- head/sys/sys/priority.h (nonexistent) +++ head/sys/sys/priority.h (revision 72376) @@ -0,0 +1,130 @@ +/* + * Copyright (c) 1994, Henrik Vestergaard Draboel + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by (name). + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_PRIORITY_H_ +#define _SYS_PRIORITY_H_ + +/* + * Process priority specifications. + */ + +/* + * Priority classes. + */ + +#define PRI_ITHD 1 /* Interrupt thread. */ +#define PRI_REALTIME 2 /* Real time process. */ +#define PRI_TIMESHARE 3 /* Time sharing process. */ +#define PRI_IDLE 4 /* Idle process. */ + +/* + * PRI_FIFO is POSIX.1B SCHED_FIFO. + */ + +#define PRI_FIFO_BIT 8 +#define PRI_FIFO (PRI_FIFO_BIT | PRI_REALTIME) + +#define PRI_BASE(P) ((P) & ~PRI_FIFO_BIT) +#define PRI_IS_REALTIME(P) (PRI_BASE(P) == PRI_REALTIME) +#define PRI_NEED_RR(P) ((P) != PRI_FIFO) + +/* + * Priorities. 
Note that with 64 run queues, differences less than 4 are + insignificant. */ + +/* + * Priorities range from 0 to 255, but differences of less than 4 (RQ_PPQ) + * are insignificant. Ranges are as follows: + * + * Interrupt threads: 0 - 63 + * Top half kernel threads: 64 - 127 + * Realtime user threads: 128 - 159 + * Time sharing user threads: 160 - 223 + * Idle user threads: 224 - 255 + * + * XXX If/When the specific interrupt thread and top half thread ranges + * disappear, a larger range can be used for user processes. + */ + +#define PRI_MIN (0) /* Highest priority. */ +#define PRI_MAX (255) /* Lowest priority. */ + +#define PRI_MIN_ITHD (PRI_MIN) +#define PRI_MAX_ITHD (PRI_MIN_KERN - 1) + +#define PI_REALTIME (PRI_MIN_ITHD + 0) +#define PI_AV (PRI_MIN_ITHD + 4) +#define PI_TTYHIGH (PRI_MIN_ITHD + 8) +#define PI_TAPE (PRI_MIN_ITHD + 12) +#define PI_NET (PRI_MIN_ITHD + 16) +#define PI_DISK (PRI_MIN_ITHD + 20) +#define PI_TTYLOW (PRI_MIN_ITHD + 24) +#define PI_DISKLOW (PRI_MIN_ITHD + 28) +#define PI_DULL (PRI_MIN_ITHD + 32) +#define PI_SOFT (PRI_MIN_ITHD + 36) + +#define PRI_MIN_KERN (64) +#define PRI_MAX_KERN (PRI_MIN_REALTIME - 1) + +#define PSWP (PRI_MIN_KERN + 0) +#define PVM (PRI_MIN_KERN + 4) +#define PINOD (PRI_MIN_KERN + 8) +#define PRIBIO (PRI_MIN_KERN + 12) +#define PVFS (PRI_MIN_KERN + 16) +#define PZERO (PRI_MIN_KERN + 20) +#define PSOCK (PRI_MIN_KERN + 24) +#define PWAIT (PRI_MIN_KERN + 28) +#define PCONFIG (PRI_MIN_KERN + 32) +#define PLOCK (PRI_MIN_KERN + 36) +#define PPAUSE (PRI_MIN_KERN + 40) + +#define PRI_MIN_REALTIME (128) +#define PRI_MAX_REALTIME (PRI_MIN_TIMESHARE - 1) + +#define PRI_MIN_TIMESHARE (160) +#define PRI_MAX_TIMESHARE (PRI_MIN_IDLE - 1) + +#define PUSER (PRI_MIN_TIMESHARE) + +#define PRI_MIN_IDLE (224) +#define PRI_MAX_IDLE (PRI_MAX) + +struct priority { + u_char pri_class; /* Scheduling class. */ + u_char pri_level; /* Normal priority level. */ + u_char pri_native; /* Priority before propagation. 
*/ + u_char pri_user; /* User priority based on p_cpu and p_nice. */ +}; + +#endif /* !_SYS_PRIORITY_H_ */ Property changes on: head/sys/sys/priority.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 72375) +++ head/sys/sys/proc.h (revision 72376) @@ -1,577 +1,569 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #include -#include /* For struct rtprio. */ +#include +#include /* XXX */ +#include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #endif #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. */ struct session { int s_count; /* Ref cnt; pgrps in session. */ struct proc *s_leader; /* Session leader. */ struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ struct tty *s_ttyp; /* Controlling terminal. */ pid_t s_sid; /* Session ID. */ /* Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; }; /* * One structure allocated per process group. */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */ LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */ struct session *pg_session; /* Pointer to session. */ struct sigiolst pg_sigiolst; /* List of sigio sources. */ pid_t pg_id; /* Pgrp id. */ int pg_jobc; /* # procs qualifying pgrp for job control */ }; struct procsig { sigset_t ps_sigignore; /* Signals being ignored. 
*/ sigset_t ps_sigcatch; /* Signals being caught by user. */ int ps_flag; struct sigacts *ps_sigacts; /* Signal actions, state. */ int ps_refcnt; }; #define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */ #define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */ /* * pasleep structure, used by asleep() syscall to hold requested priority * and timeout values for await(). */ struct pasleep { int as_priority; /* Async priority. */ int as_timo; /* Async timeout. */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[0]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. 
* * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - either by curproc or a lock which prevents the lock from * going away, such as (d,e) * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * * If the locking identifier is followed by a plus '+', then the specified * member follows these special rules: * - It is only written to by the current process. * - It can be read by the current process and other processes. * Thus, the locking rules for it are slightly different, and allow us to * optimize the case where a process reads its own such value: * - Writes to this member are locked. * - Reads of this value by other processes are locked. * - Reads of this value by the current process need not be locked. */ struct ithd; struct proc { TAILQ_ENTRY(proc) p_procq; /* (j) Run/mutex queue. */ TAILQ_ENTRY(proc) p_slpq; /* (j) Sleep queue. */ LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ /* substructures: */ struct pcred *p_cred; /* (c+) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (m) Process limits. */ struct vm_object *p_upages_obj;/* (a) Upages object. */ struct procsig *p_procsig; /* (c) Signal actions, state (CPU). */ #define p_sigacts p_procsig->ps_sigacts #define p_sigignore p_procsig->ps_sigignore #define p_sigcatch p_procsig->ps_sigcatch #define p_ucred p_cred->pc_ucred #define p_rlimit p_limit->pl_rlimit int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ int p_intr_nesting_level; /* (k) Interrupt recursion. */ char p_stat; /* (j) S* process status. 
*/ char p_pad1[3]; pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (c) List of processes in pgrp. */ struct proc *p_pptr; /* (e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c) Save parent pid during ptrace. XXX */ int p_dupfd; /* (c) Sideways ret value from fdopen. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ /* scheduling */ u_int p_estcpu; /* (j) Time averaged value of p_cpticks. */ int p_cpticks; /* (j) Ticks of cpu time. */ fixpt_t p_pctcpu; /* (j) %cpu during p_swtime. */ struct callout p_slpcallout; /* (h) Callout for sleep. */ void *p_wchan; /* (j) Sleep address. */ const char *p_wmesg; /* (j) Reason for sleep. */ u_int p_swtime; /* (j) Time swapped in or out. */ u_int p_slptime; /* (j) Time since last blocked. */ struct callout p_itcallout; /* (h) Interval timer callout. */ struct itimerval p_realtimer; /* (h?/k?) Alarm timer. */ u_int64_t p_runtime; /* (j) Real time in microsec. */ u_int64_t p_uu; /* (j) Previous user time in microsec. */ u_int64_t p_su; /* (j) Previous system time in microsec. */ u_int64_t p_iu; /* (j) Previous interrupt time in microsec. */ u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ u_int64_t p_sticks; /* (j) Statclock hits in system mode. */ u_int64_t p_iticks; /* (j) Statclock hits processing intr. */ int p_traceflag; /* (j?) Kernel trace points. */ struct vnode *p_tracep; /* (j?) Trace to vnode. */ sigset_t p_siglist; /* (c) Signals arrived but not delivered. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ char p_lock; /* (c) Process lock (prevent swap) count. */ struct mtx p_mtx; /* (k) Lock for this struct. */ u_char p_oncpu; /* (j) Which cpu we are on. 
*/ u_char p_lastcpu; /* (j) Last cpu we were on. */ char p_rqindex; /* (j) Run queue index. */ short p_locks; /* (*) DEBUG: lockmgr count of held locks */ u_int p_stops; /* (c) Procfs event bitmask. */ u_int p_stype; /* (c) Procfs stop event type. */ char p_step; /* (c) Procfs stop *once* flag. */ u_char p_pfsflags; /* (c) Procfs flags. */ char p_pad3[2]; /* Alignment. */ register_t p_retval[2]; /* (k) Syscall aux returns. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ sigset_t p_oldsigmask; /* (c) Saved mask from before sigpause. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ struct klist p_klist; /* (c) Knotes attached to this process. */ LIST_HEAD(, mtx) p_heldmtx; /* (j) For debugging code. */ struct mtx *p_blocked; /* (j) Mutex process is blocked on. */ const char *p_mtxname; /* (j) Name of mutex blocked on. */ LIST_HEAD(, mtx) p_contested; /* (j) Contested locks. */ /* End area that is zeroed on creation. */ #define p_endzero p_startcopy /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_sigmask sigset_t p_sigmask; /* (c) Current signal mask. */ stack_t p_sigstk; /* (c) Stack pointer and on-stack flag. */ int p_magic; /* (b) Magic number. */ - u_char p_priority; /* (j) Process priority. */ - u_char p_usrpri; /* (j) User priority based on p_cpu and p_nice. */ - u_char p_nativepri; /* (j) Priority before propagation. */ + struct priority p_pri; /* (j) Process priority. */ char p_nice; /* (j?/k?) Process "nice" value. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (e?/c?) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) System call dispatch information. */ - struct rtprio p_rtprio; /* (j) Realtime priority. */ struct prison *p_prison; /* (b?) jail(4). */ struct pargs *p_args; /* (b?) Process arguments. */ /* End area that is copied on creation. 
*/ #define p_endcopy p_addr struct user *p_addr; /* (k) Kernel virtual addr of u-area (CPU). */ struct mdproc p_md; /* (k) Any machine-dependent fields. */ u_short p_xstat; /* (c) Exit status for wait; also stop sig. */ u_short p_acflag; /* (c) Accounting flags. */ struct rusage *p_ru; /* (a) Exit information. XXX */ void *p_aioinfo; /* (c) ASYNC I/O info. */ struct proc *p_peers; /* (c) */ struct proc *p_leader; /* (c) */ struct pasleep p_asleep; /* (k) Used by asleep()/await(). */ void *p_emuldata; /* (c) Emulator state data. */ struct ithd *p_ithd; /* (b) For interrupt threads only. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id /* Status values (p_stat). */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SMTX 7 /* Blocked on a mutex. */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread. */ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Debugging process has waited for child. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ /* Should be moved to machine-dependent areas. */ #define P_BUFEXHAUST 0x100000 /* Dirty buffers flush is in progress. 
*/ #define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress. */ #define P_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_OLDMASK 0x2000000 /* Need to restore mask after suspend. */ #define P_ALTSTACK 0x4000000 /* Have alternate signal stack. */ /* These flags are kept in p_sflag and are protected with sched_lock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_OWEUPC 0x00002 /* Owe process an addupc() call at next ast. */ #define PS_PROFIL 0x00004 /* Has started profiling. */ #define PS_SINTR 0x00008 /* Sleep is interruptible. */ #define PS_TIMEOUT 0x00010 /* Timing out during sleep. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ #define PS_CVWAITQ 0x00080 /* Proces is on a cv_waitq (not slpq). */ #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ #define PS_SWAPPING 0x00200 /* Process is being swapped. */ #define PS_ASTPENDING 0x00400 /* Process has a pending ast. */ #define PS_NEEDRESCHED 0x00800 /* Process needs to yield. */ #define P_MAGIC 0xbeefface #define P_CAN_SEE 1 #define P_CAN_KILL 2 #define P_CAN_SCHED 3 #define P_CAN_DEBUG 4 /* * MOVE TO ucred.h? * * Shareable process credentials (always resident). This includes a reference * to the current user credentials as well as real and saved ids that may be * used to change ids. */ struct pcred { struct ucred *pc_ucred; /* Current credentials. */ uid_t p_ruid; /* Real user id. */ uid_t p_svuid; /* Saved effective user id. */ gid_t p_rgid; /* Real group id. */ gid_t p_svgid; /* Saved effective group id. */ int p_refcnt; /* Number of references. */ struct uidinfo *p_uidinfo; /* Per uid resource consumption. 
*/ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif static __inline int sigonstack(size_t sp) { register struct proc *p = curproc; return ((p->p_flag & P_ALTSTACK) ? #if defined(COMPAT_43) || defined(COMPAT_SUNOS) ((p->p_sigstk.ss_size == 0) ? (p->p_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size)) #else ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size) #endif : 0); } /* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. */ #define need_resched() do { \ mtx_assert(&sched_lock, MA_OWNED); \ curproc->p_sflag |= PS_NEEDRESCHED; \ } while (0) #define resched_wanted() (curproc->p_sflag & PS_NEEDRESCHED) #define clear_resched() do { \ mtx_assert(&sched_lock, MA_OWNED); \ curproc->p_sflag &= ~PS_NEEDRESCHED; \ } while (0) /* * Notify the current process (p) that it has a signal pending, * process as soon as possible. */ #define aston() signotify(CURPROC) #define signotify(p) do { \ mtx_assert(&sched_lock, MA_OWNED); \ (p)->p_sflag |= PS_ASTPENDING; \ } while (0) #define astpending() (curproc->p_sflag & PS_ASTPENDING) #define astoff() do { \ mtx_assert(&sched_lock, MA_OWNED); \ CURPROC->p_sflag &= ~PS_ASTPENDING; \ } while (0) /* Handy macro to determine if p1 can mangle p2. */ #define PRISON_CHECK(p1, p2) \ ((p1)->p_prison == NULL || (p1)->p_prison == (p2)->p_prison) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } /* STOPEVENT() is MP safe. 
*/ #define STOPEVENT(p, e, v) do { \ PROC_LOCK(p); \ if ((p)->p_stops & (e)) { \ stopevent((p), (e), (v)); \ } \ PROC_UNLOCK(p); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) /* Lock and unlock the proc lists. */ #define ALLPROC_LOCK(how) \ lockmgr(&allproc_lock, (how), NULL, CURPROC) #define AP_SHARED LK_SHARED #define AP_EXCLUSIVE LK_EXCLUSIVE #define AP_RELEASE LK_RELEASE /* Lock and unlock the proc child and sibling lists. */ #define PROCTREE_LOCK(how) \ lockmgr(&proctree_lock, (how), NULL, CURPROC) #define PROCTREE_ASSERT(what) \ LOCKMGR_ASSERT(&proctree_lock, (what), CURPROC) #define PT_SHARED LK_SHARED #define PT_EXCLUSIVE LK_EXCLUSIVE #define PT_RELEASE LK_RELEASE /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ if ((p)->p_lock++ == 0) \ faultin(p); \ PROC_UNLOCK(p); \ } while (0) #define PRELE(p) do { \ PROC_LOCK(p); \ (--(p)->p_lock); \ PROC_UNLOCK(p); \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct lock allproc_lock; extern struct lock proctree_lock; extern struct proc proc0; /* Process slot for swapper. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; extern int ps_argsopen; extern int ps_showallprocs; extern int sched_quantum; /* Scheduling quantum in ticks. */ LIST_HEAD(proclist, proc); +TAILQ_HEAD(procqueue, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. 
*/ extern struct proc *updateproc; /* Process slot for syncer (sic). */ -#define NQS 32 /* 32 run queues. */ - -TAILQ_HEAD(rq, proc); -extern struct rq itqueues[]; -extern struct rq rtqueues[]; -extern struct rq queues[]; -extern struct rq idqueues[]; extern struct vm_zone *proc_zone; /* * XXX macros for scheduler. Shouldn't be here, but currently needed for * bounding the dubious p_estcpu inheritance in wait1(). * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ - PPQ) + INVERSE_ESTCPU_WEIGHT - 1) + RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #define NICE_WEIGHT 1 /* Priorities per nice level. */ -#define PPQ (128 / NQS) /* Priorities per queue. */ struct mtx; struct trapframe; struct proc *pfind __P((pid_t)); /* Find process by id. */ struct pgrp *pgfind __P((pid_t)); /* Find process group by id. */ struct proc *zpfind __P((pid_t)); /* Find zombie process by id. 
*/ struct proc *chooseproc __P((void)); int enterpgrp __P((struct proc *p, pid_t pgid, int mksess)); void faultin __P((struct proc *p)); void fixjobc __P((struct proc *p, struct pgrp *pgrp, int entering)); int fork1 __P((struct proc *, int, struct proc **)); void fork_exit __P((void (*)(void *, struct trapframe *), void *, struct trapframe *)); void fork_return __P((struct proc *, struct trapframe *)); int inferior __P((struct proc *p)); int leavepgrp __P((struct proc *p)); void mi_switch __P((void)); int p_can __P((const struct proc *p1, const struct proc *p2, int operation, int *privused)); int p_trespass __P((struct proc *p1, struct proc *p2)); void procinit __P((void)); void proc_reparent __P((struct proc *child, struct proc *newparent)); -u_int32_t procrunnable __P((void)); +int procrunnable __P((void)); void remrunqueue __P((struct proc *)); void resetpriority __P((struct proc *)); int roundrobin_interval __P((void)); void schedclock __P((struct proc *)); void setrunnable __P((struct proc *)); void setrunqueue __P((struct proc *)); void setsugid __P((struct proc *p)); void sleepinit __P((void)); void stopevent __P((struct proc *, u_int, u_int)); void cpu_idle __P((void)); void cpu_switch __P((void)); void cpu_throw __P((void)) __dead2; void unsleep __P((struct proc *)); void updatepri __P((struct proc *)); void userret __P((struct proc *, struct trapframe *, u_quad_t)); void maybe_resched __P((struct proc *)); void cpu_exit __P((struct proc *)) __dead2; void exit1 __P((struct proc *, int)) __dead2; void cpu_fork __P((struct proc *, struct proc *, int)); void cpu_set_fork_handler __P((struct proc *, void (*)(void *), void *)); int trace_req __P((struct proc *)); void cpu_wait __P((struct proc *)); int cpu_coredump __P((struct proc *, struct vnode *, struct ucred *)); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/rtprio.h =================================================================== --- head/sys/sys/rtprio.h (revision 72375) +++ 
head/sys/sys/rtprio.h (revision 72376) @@ -1,103 +1,90 @@ /* * Copyright (c) 1994, Henrik Vestergaard Draboel * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by (name). * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_RTPRIO_H_ #define _SYS_RTPRIO_H_ +#include + /* * Process realtime-priority specifications to rtprio. */ /* priority types. Start at 1 to catch uninitialized fields. 
*/ -#define RTP_PRIO_ITHREAD 1 /* interrupt thread */ -#define RTP_PRIO_REALTIME 2 /* real time process */ -#define RTP_PRIO_NORMAL 3 /* time sharing process */ -#define RTP_PRIO_IDLE 4 /* idle process */ +#define RTP_PRIO_REALTIME PRI_REALTIME /* real time process */ +#define RTP_PRIO_NORMAL PRI_TIMESHARE /* time sharing process */ +#define RTP_PRIO_IDLE PRI_IDLE /* idle process */ /* RTP_PRIO_FIFO is POSIX.1B SCHED_FIFO. */ -#define RTP_PRIO_FIFO_BIT 4 -#define RTP_PRIO_FIFO (RTP_PRIO_REALTIME | RTP_PRIO_FIFO_BIT) -#define RTP_PRIO_BASE(P) ((P) & ~RTP_PRIO_FIFO_BIT) -#define RTP_PRIO_IS_REALTIME(P) (RTP_PRIO_BASE(P) == RTP_PRIO_REALTIME) -#define RTP_PRIO_NEED_RR(P) ((P) != RTP_PRIO_FIFO) +#define RTP_PRIO_FIFO_BIT PRI_FIFO_BIT +#define RTP_PRIO_FIFO PRI_FIFO +#define RTP_PRIO_BASE(P) PRI_BASE(P) +#define RTP_PRIO_IS_REALTIME(P) PRI_IS_REALTIME(P) +#define RTP_PRIO_NEED_RR(P) PRI_NEED_RR(P) /* priority range */ #define RTP_PRIO_MIN 0 /* Highest priority */ #define RTP_PRIO_MAX 31 /* Lowest priority */ /* * rtprio() syscall functions */ #define RTP_LOOKUP 0 #define RTP_SET 1 #ifndef LOCORE /* - * Scheduling class information. This is strictly speaking not only - * for real-time processes. We should replace it with two variables: - * class and priority. At the moment we use prio here for real-time - * and interrupt processes, and for others we use proc.p_pri. FIXME. + * Scheduling class information. */ struct rtprio { u_short type; /* scheduling class */ u_short prio; }; -#endif -/* - * Interrupt thread priorities, after BSD/OS. 
- */ -#define PI_REALTIME 1 /* very high priority (clock) */ -#define PI_AV 2 /* Audio/video devices */ -#define PI_TTYHIGH 3 /* High priority tty's (small FIFOs) */ -#define PI_TAPE 4 /* Tape devices (high for streaming) */ -#define PI_NET 5 /* Network interfaces */ -#define PI_DISK 6 /* Disks and SCSI */ -#define PI_TTYLOW 7 /* Ttys with big buffers */ -#define PI_DISKLOW 8 /* Disks that do programmed I/O */ -#define PI_DULL 9 /* We don't know or care */ - -/* Soft interrupt threads */ -#define PI_SOFT 15 /* All soft interrupts */ +#ifdef _KERNEL +int rtp_to_pri(struct rtprio *, struct priority *); +void pri_to_rtp(struct priority *, struct rtprio *); +#endif +#endif #ifndef _KERNEL #include __BEGIN_DECLS int rtprio __P((int, pid_t, struct rtprio *)); __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_RTPRIO_H_ */ Index: head/sys/sys/runq.h =================================================================== --- head/sys/sys/runq.h (nonexistent) +++ head/sys/sys/runq.h (revision 72376) @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2001 Jake Burkholder + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RUNQ_H_ +#define _RUNQ_H_ + +/* + * Run queue parameters. + */ + +#define RQ_NQS (64) /* Number of run queues. */ +#define RQ_PPQ (4) /* Priorities per queue. */ + +#define RQB_LEN (2) /* Number of priority status words. */ +#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */ +#define RQB_BPW (1<> RQB_L2BPW) +#define RQB_FFS(word) (ffs(word)) + +/* + * Type of run queue status word. + */ +typedef u_int32_t rqb_word_t; + +/* + * Head of run queues. + */ +TAILQ_HEAD(rqhead, proc); + +/* + * Bit array which maintains the status of a run queue. When a queue is + * non-empty the bit corresponding to the queue number will be set. + */ +struct rqbits { + rqb_word_t rqb_bits[RQB_LEN]; +}; + +/* + * Run queue structure. Contains an array of run queues on which processes + * are placed, and a structure to maintain the status of each queue. 
+ */ +struct runq { + struct rqbits rq_status; + struct rqhead rq_queues[RQ_NQS]; +}; + +void runq_add(struct runq *, struct proc *); +int runq_check(struct runq *); +struct proc *runq_choose(struct runq *); +void runq_init(struct runq *); +void runq_remove(struct runq *, struct proc *); + +#endif Property changes on: head/sys/sys/runq.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sys/systm.h =================================================================== --- head/sys/sys/systm.h (revision 72375) +++ head/sys/sys/systm.h (revision 72376) @@ -1,284 +1,282 @@ /*- * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include extern int securelevel; /* system security level (see init(8)) */ extern int cold; /* nonzero if we are doing a cold boot */ extern const char *panicstr; /* panic message */ extern int dumping; /* system is dumping */ extern char version[]; /* system version */ extern char copyright[]; /* system copyright */ extern int nswap; /* size of swap space */ extern int selwait; /* select timeout address */ -extern u_char curpriority; /* priority of current process */ - extern int physmem; /* physical memory */ extern dev_t dumpdev; /* dump device */ extern long dumplo; /* offset into dumpdev */ extern dev_t rootdev; /* root device */ extern dev_t rootdevs[2]; /* possible root devices */ extern char *rootdevnames[2]; /* names of possible root devices */ extern struct vnode *rootvp; /* vnode equivalent to above */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print 
verbose messages */ #ifdef INVARIANTS /* The option is always available */ #define KASSERT(exp,msg) do { if (!(exp)) panic msg; } while (0) #else #define KASSERT(exp,msg) #endif /* * General function declarations. */ struct clockframe; struct malloc_type; struct mtx; struct proc; struct timeval; struct tty; struct ucred; struct uio; void Debugger __P((const char *msg)); int nullop __P((void)); int eopnotsupp __P((void)); int einval __P((void)); int seltrue __P((dev_t dev, int which, struct proc *p)); int ureadc __P((int, struct uio *)); void *hashinit __P((int count, struct malloc_type *type, u_long *hashmask)); void *phashinit __P((int count, struct malloc_type *type, u_long *nentries)); void cpu_boot __P((int)); void cpu_rootconf __P((void)); void tablefull __P((const char *)); int kvprintf __P((char const *, void (*)(int, void*), void *, int, _BSD_VA_LIST_)) __printflike(1, 0); void log __P((int, const char *, ...)) __printflike(2, 3); void log_console __P((struct uio *)); int printf __P((const char *, ...)) __printflike(1, 2); int snprintf __P((char *, size_t, const char *, ...)) __printflike(3, 4); int sprintf __P((char *buf, const char *, ...)) __printflike(2, 3); int uprintf __P((const char *, ...)) __printflike(1, 2); int vprintf __P((const char *, _BSD_VA_LIST_)) __printflike(1, 0); int vsnprintf __P((char *, size_t, const char *, _BSD_VA_LIST_)) __printflike(3, 0); int vsprintf __P((char *buf, const char *, _BSD_VA_LIST_)) __printflike(2, 0); int ttyprintf __P((struct tty *, const char *, ...)) __printflike(2, 3); int sscanf __P((const char *, char const *, ...)); int vsscanf __P((const char *, char const *, _BSD_VA_LIST_)); long strtol __P((const char *, char **, int)); u_long strtoul __P((const char *, char **, int)); quad_t strtoq __P((const char *, char **, int)); u_quad_t strtouq __P((const char *, char **, int)); void tprintf __P((struct proc *p, int pri, const char *, ...)) __printflike(3, 4); void bcopy __P((const void *from, void *to, size_t 
len)); void ovbcopy __P((const void *from, void *to, size_t len)); #ifdef __i386__ extern void (*bzero) __P((void *buf, size_t len)); #else void bzero __P((void *buf, size_t len)); #endif void *memcpy __P((void *to, const void *from, size_t len)); int copystr __P((const void *kfaddr, void *kdaddr, size_t len, size_t *lencopied)); int copyinstr __P((const void *udaddr, void *kaddr, size_t len, size_t *lencopied)); int copyin __P((const void *udaddr, void *kaddr, size_t len)); int copyout __P((const void *kaddr, void *udaddr, size_t len)); int fubyte __P((const void *base)); int subyte __P((void *base, int byte)); int suibyte __P((void *base, int byte)); long fuword __P((const void *base)); int suword __P((void *base, long word)); int fusword __P((void *base)); int susword __P((void *base, int word)); void realitexpire __P((void *)); void hardclock __P((struct clockframe *frame)); void softclock __P((void *)); void statclock __P((struct clockframe *frame)); void startprofclock __P((struct proc *)); void stopprofclock __P((struct proc *)); void setstatclockrate __P((int hzrate)); /* flags for suser_xxx() */ #define PRISON_ROOT 1 int suser __P((const struct proc *)); int suser_xxx __P((const struct ucred *cred, const struct proc *proc, int flag)); char *getenv __P((char *name)); int getenv_int __P((char *name, int *data)); quad_t getenv_quad __P((char *name, quad_t *data)); extern char *kern_envp; #ifdef APM_FIXUP_CALLTODO void adjust_timeout_calltodo __P((struct timeval *time_change)); #endif /* APM_FIXUP_CALLTODO */ #include /* Initialize the world */ void consinit __P((void)); void cpu_initclocks __P((void)); void usrinfoinit __P((void)); /* Finalize the world. */ void shutdown_nice __P((int)); /* * Kernel to clock driver interface. 
*/ void inittodr __P((time_t base)); void resettodr __P((void)); void startrtclock __P((void)); /* Timeouts */ typedef void timeout_t __P((void *)); /* timeout function type */ #define CALLOUT_HANDLE_INITIALIZER(handle) \ { NULL } void callout_handle_init __P((struct callout_handle *)); struct callout_handle timeout __P((timeout_t *, void *, int)); void untimeout __P((timeout_t *, void *, struct callout_handle)); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline void spl0(void) { return; } static __inline intrmask_t splbio(void) { return 0; } static __inline intrmask_t splcam(void) { return 0; } static __inline intrmask_t splclock(void) { return 0; } static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t splsoftcam(void) { return 0; } static __inline intrmask_t splsoftclock(void) { return 0; } static __inline intrmask_t splsofttty(void) { return 0; } static __inline intrmask_t splsoftvm(void) { return 0; } static __inline intrmask_t splsofttq(void) { return 0; } static __inline intrmask_t splstatclock(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline intrmask_t splvm(void) { return 0; } static __inline void splx(intrmask_t ipl) { return; } #if defined(__ia64__) #include #endif /* * Various callout lists. */ /* Exit callout list declarations. */ typedef void (*exitlist_fn) __P((struct proc *procp)); int at_exit __P((exitlist_fn function)); int rm_at_exit __P((exitlist_fn function)); /* Fork callout list declarations. */ typedef void (*forklist_fn) __P((struct proc *parent, struct proc *child, int flags)); int at_fork __P((forklist_fn function)); int rm_at_fork __P((forklist_fn function)); /* * Not exactly a callout LIST, but a callout entry. * Allow an external module to define a hardware watchdog tickler. 
* Normally a process would do this, but there are times when the * kernel needs to be able to hold off the watchdog, when the process * is not active, e.g., when dumping core. */ typedef void (*watchdog_tickle_fn) __P((void)); extern watchdog_tickle_fn wdog_tickler; /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int msleep __P((void *chan, struct mtx *mtx, int pri, const char *wmesg, int timo)); #define tsleep(chan, pri, wmesg, timo) msleep(chan, NULL, pri, wmesg, timo) int asleep __P((void *chan, int pri, const char *wmesg, int timo)); #define await(pri, timo) mawait(NULL, pri, timo) int mawait __P((struct mtx *mtx, int pri, int timo)); void wakeup __P((void *chan)); void wakeup_one __P((void *chan)); /* * Common `dev_t' stuff are declared here to avoid #include poisoning */ int major(dev_t x); int minor(dev_t x); dev_t makedev(int x, int y); udev_t dev2udev(dev_t x); dev_t udev2dev(udev_t x, int b); int uminor(udev_t dev); int umajor(udev_t dev); udev_t makeudev(int x, int y); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY __P((int usec)); #endif /* !_SYS_SYSTM_H_ */ Index: head/sys/sys/tty.h =================================================================== --- head/sys/sys/tty.h (revision 72375) +++ head/sys/sys/tty.h (revision 72376) @@ -1,276 +1,276 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty.h 8.6 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_TTY_H_ #define _SYS_TTY_H_ #include #include #include /* * Clists are character lists, which is a variable length linked list * of cblocks, with a count of the number of characters in the list. */ struct clist { int c_cc; /* Number of characters in the clist. */ int c_cbcount; /* Number of cblocks. */ int c_cbmax; /* Max # cblocks allowed for this clist. 
*/ int c_cbreserved; /* # cblocks reserved for this clist. */ char *c_cf; /* Pointer to the first cblock. */ char *c_cl; /* Pointer to the last cblock. */ }; /* * Per-tty structure. * * Should be split in two, into device and tty drivers. * Glue could be masks of what to echo and circular buffer * (low, high, timeout). */ struct tty { struct clist t_rawq; /* Device raw input queue. */ long t_rawcc; /* Raw input queue statistics. */ struct clist t_canq; /* Device canonical queue. */ long t_cancc; /* Canonical queue statistics. */ struct clist t_outq; /* Device output queue. */ long t_outcc; /* Output queue statistics. */ int t_line; /* Interface to device drivers. */ dev_t t_dev; /* Device. */ int t_state; /* Device and driver (TS*) state. */ int t_flags; /* Tty flags. */ int t_timeout; /* Timeout for ttywait() */ struct pgrp *t_pgrp; /* Foreground process group. */ struct session *t_session; /* Enclosing session. */ struct sigio *t_sigio; /* Information for async I/O. */ struct selinfo t_rsel; /* Tty read/oob select. */ struct selinfo t_wsel; /* Tty write select. */ struct termios t_termios; /* Termios state. */ struct winsize t_winsize; /* Window size. */ /* Start output. */ void (*t_oproc) __P((struct tty *)); /* Stop output. */ void (*t_stop) __P((struct tty *, int)); /* Set hardware state. */ int (*t_param) __P((struct tty *, struct termios *)); void *t_sc; /* XXX: net/if_sl.c:sl_softc. */ int t_column; /* Tty output column. */ int t_rocount, t_rocol; /* Tty. */ int t_ififosize; /* Total size of upstream fifos. */ int t_ihiwat; /* High water mark for input. */ int t_ilowat; /* Low water mark for input. */ speed_t t_ispeedwat; /* t_ispeed override for watermarks. */ int t_ohiwat; /* High water mark for output. */ int t_olowat; /* Low water mark for output. */ speed_t t_ospeedwat; /* t_ospeed override for watermarks. */ int t_gen; /* Generation number. 
*/ SLIST_ENTRY(tty) t_list; /* Global chain of ttys for pstat(8) */ }; #define t_cc t_termios.c_cc #define t_cflag t_termios.c_cflag #define t_iflag t_termios.c_iflag #define t_ispeed t_termios.c_ispeed #define t_lflag t_termios.c_lflag #define t_min t_termios.c_min #define t_oflag t_termios.c_oflag #define t_ospeed t_termios.c_ospeed #define t_time t_termios.c_time -#define TTIPRI 25 /* Sleep priority for tty reads. */ -#define TTOPRI 26 /* Sleep priority for tty writes. */ +#define TTIPRI (PSOCK + 1) /* Sleep priority for tty reads. */ +#define TTOPRI (PSOCK + 2) /* Sleep priority for tty writes. */ /* * User data unfortunately has to be copied through buffers on the way to * and from clists. The buffers are on the stack so their sizes must be * fairly small. */ #define IBUFSIZ 384 /* Should be >= max value of MIN. */ #define OBUFSIZ 100 #ifndef TTYHOG #define TTYHOG 1024 #endif #ifdef _KERNEL #define TTMAXHIWAT roundup(2048, CBSIZE) #define TTMINHIWAT roundup(100, CBSIZE) #define TTMAXLOWAT 256 #define TTMINLOWAT 32 #endif /* These flags are kept in t_state. */ #define TS_SO_OLOWAT 0x00001 /* Wake up when output <= low water. */ #define TS_ASYNC 0x00002 /* Tty in async I/O mode. */ #define TS_BUSY 0x00004 /* Draining output. */ #define TS_CARR_ON 0x00008 /* Carrier is present. */ #define TS_FLUSH 0x00010 /* Outq has been flushed during DMA. */ #define TS_ISOPEN 0x00020 /* Open has completed. */ #define TS_TBLOCK 0x00040 /* Further input blocked. */ #define TS_TIMEOUT 0x00080 /* Wait for output char processing. */ #define TS_TTSTOP 0x00100 /* Output paused. */ #ifdef notyet #define TS_WOPEN 0x00200 /* Open in progress. */ #endif #define TS_XCLUDE 0x00400 /* Tty requires exclusivity. */ /* State for intra-line fancy editing work. */ #define TS_BKSL 0x00800 /* State for lowercase \ work. */ #define TS_CNTTB 0x01000 /* Counting tab width, ignore FLUSHO. */ #define TS_ERASE 0x02000 /* Within a \.../ for PRTRUB. */ #define TS_LNCH 0x04000 /* Next character is literal. 
*/ #define TS_TYPEN 0x08000 /* Retyping suspended input (PENDIN). */ #define TS_LOCAL (TS_BKSL | TS_CNTTB | TS_ERASE | TS_LNCH | TS_TYPEN) /* Extras. */ #define TS_CAN_BYPASS_L_RINT 0x010000 /* Device in "raw" mode. */ #define TS_CONNECTED 0x020000 /* Connection open. */ #define TS_SNOOP 0x040000 /* Device is being snooped on. */ #define TS_SO_OCOMPLETE 0x080000 /* Wake up when output completes. */ #define TS_ZOMBIE 0x100000 /* Connection lost. */ /* Hardware flow-control-invoked bits. */ #define TS_CAR_OFLOW 0x200000 /* For MDMBUF (XXX handle in driver). */ #ifdef notyet #define TS_CTS_OFLOW 0x400000 /* For CCTS_OFLOW. */ #define TS_DSR_OFLOW 0x800000 /* For CDSR_OFLOW. */ #endif /* Character type information. */ #define ORDINARY 0 #define CONTROL 1 #define BACKSPACE 2 #define NEWLINE 3 #define TAB 4 #define VTAB 5 #define RETURN 6 struct speedtab { int sp_speed; /* Speed. */ int sp_code; /* Code. */ }; /* Modem control commands (driver). */ #define DMSET 0 #define DMBIS 1 #define DMBIC 2 #define DMGET 3 /* Flags on a character passed to ttyinput. */ #define TTY_CHARMASK 0x000000ff /* Character mask */ #define TTY_QUOTE 0x00000100 /* Character quoted */ #define TTY_ERRORMASK 0xff000000 /* Error mask */ #define TTY_FE 0x01000000 /* Framing error */ #define TTY_PE 0x02000000 /* Parity error */ #define TTY_OE 0x04000000 /* Overrun error */ #define TTY_BI 0x08000000 /* Break condition */ /* Is tp controlling terminal for p? */ #define isctty(p, tp) \ ((p)->p_session == (tp)->t_session && (p)->p_flag & P_CONTROLT) /* Is p in background of tp? */ #define isbackground(p, tp) \ (isctty((p), (tp)) && (p)->p_pgrp != (tp)->t_pgrp) /* Unique sleep addresses. 
*/ #define TSA_CARR_ON(tp) ((void *)&(tp)->t_rawq) #define TSA_HUP_OR_INPUT(tp) ((void *)&(tp)->t_rawq.c_cf) #define TSA_OCOMPLETE(tp) ((void *)&(tp)->t_outq.c_cl) #define TSA_OLOWAT(tp) ((void *)&(tp)->t_outq) #define TSA_PTC_READ(tp) ((void *)&(tp)->t_outq.c_cf) #define TSA_PTC_WRITE(tp) ((void *)&(tp)->t_rawq.c_cl) #define TSA_PTS_READ(tp) ((void *)&(tp)->t_canq) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_TTYS); #endif extern struct tty *constty; /* Temporary virtual console. */ int b_to_q __P((char *cp, int cc, struct clist *q)); void catq __P((struct clist *from, struct clist *to)); void clist_alloc_cblocks __P((struct clist *q, int ccmax, int ccres)); void clist_free_cblocks __P((struct clist *q)); int getc __P((struct clist *q)); void ndflush __P((struct clist *q, int cc)); char *nextc __P((struct clist *q, char *cp, int *c)); void nottystop __P((struct tty *tp, int rw)); int putc __P((int c, struct clist *q)); int q_to_b __P((struct clist *q, char *cp, int cc)); void termioschars __P((struct termios *t)); int tputchar __P((int c, struct tty *tp)); int ttcompat __P((struct tty *tp, u_long com, caddr_t data, int flag)); int ttioctl __P((struct tty *tp, u_long com, void *data, int flag)); int ttread __P((struct tty *tp, struct uio *uio, int flag)); void ttrstrt __P((void *tp)); int ttsetcompat __P((struct tty *tp, u_long *com, caddr_t data, struct termios *term)); void ttsetwater __P((struct tty *tp)); int ttspeedtab __P((int speed, struct speedtab *table)); int ttstart __P((struct tty *tp)); void ttwakeup __P((struct tty *tp)); int ttwrite __P((struct tty *tp, struct uio *uio, int flag)); void ttwwakeup __P((struct tty *tp)); void ttyblock __P((struct tty *tp)); void ttychars __P((struct tty *tp)); int ttycheckoutq __P((struct tty *tp, int wait)); int ttyclose __P((struct tty *tp)); void ttyflush __P((struct tty *tp, int rw)); void ttyfree __P((struct tty *tp)); void ttyinfo __P((struct tty *tp)); int ttyinput __P((int c, struct tty *tp)); int 
ttylclose __P((struct tty *tp, int flag)); struct tty *ttymalloc __P((struct tty *tp)); int ttymodem __P((struct tty *tp, int flag)); int ttyopen __P((dev_t device, struct tty *tp)); int ttypoll __P((dev_t dev, int events, struct proc *p)); int ttyread __P((dev_t dev, struct uio *uio, int flag)); void ttyregister __P((struct tty *tp)); int ttysleep __P((struct tty *tp, void *chan, int pri, char *wmesg, int timeout)); int ttywait __P((struct tty *tp)); int ttywrite __P((dev_t dev, struct uio *uio, int flag)); int unputc __P((struct clist *q)); #endif /* _KERNEL */ #endif /* !_SYS_TTY_H_ */ Index: head/sys/sys/user.h =================================================================== --- head/sys/sys/user.h (revision 72375) +++ head/sys/sys/user.h (revision 72376) @@ -1,175 +1,172 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)user.h 8.2 (Berkeley) 9/23/93 * $FreeBSD$ */ #ifndef _SYS_USER_H_ #define _SYS_USER_H_ #include #ifndef _KERNEL /* stuff that *used* to be included by user.h, or is now needed */ #include #include #include #include #include #include #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #endif /* !_KERNEL */ #ifndef _SYS_RESOURCEVAR_H_ #include #endif #ifndef _SYS_SIGNALVAR_H_ #include #endif /* * KERN_PROC subtype ops return arrays of selected proc structure entries: * * When adding new fields to this structure, ALWAYS add them at the end * and decrease the size of the spare field by the amount of space that * you are adding. Byte aligned data should be added to the ki_sparestring * space; other entries should be added to the ki_spare space. Always * verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE when you are * done. If you change the size of this structure, many programs will stop * working! Once you have added the new field, you will need to add code * to initialize it in two places: kern/kern_proc.c in the function * fill_kinfo_proc and in lib/libkvm/kvm_proc.c in the function kvm_proclist. 
*/ #ifdef __alpha__ -#define KINFO_PROC_SIZE 904 /* the correct size for kinfo_proc */ +#define KINFO_PROC_SIZE 912 /* the correct size for kinfo_proc */ #else -#define KINFO_PROC_SIZE 640 /* the correct size for kinfo_proc */ +#define KINFO_PROC_SIZE 644 /* the correct size for kinfo_proc */ #endif #define WMESGLEN 8 /* size of returned wchan message */ #define MTXNAMELEN 8 /* size of returned mutex name */ struct kinfo_proc { int ki_structsize; /* size of this structure */ struct pargs *ki_args; /* address of command arguments */ struct proc *ki_paddr; /* address of proc */ struct user *ki_addr; /* kernel virtual addr of u-area */ struct vnode *ki_tracep; /* pointer to trace file */ struct vnode *ki_textvp; /* pointer to executable file */ struct filedesc *ki_fd; /* pointer to open file info */ struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ void *ki_wchan; /* sleep address */ pid_t ki_pid; /* Process identifier */ pid_t ki_ppid; /* parent process id */ pid_t ki_pgid; /* process group id */ pid_t ki_tpgid; /* tty process group id */ pid_t ki_sid; /* Process session ID */ pid_t ki_tsid; /* Terminal session ID */ short ki_jobc; /* job control counter */ udev_t ki_tdev; /* controlling tty dev */ sigset_t ki_siglist; /* Signals arrived but not delivered */ sigset_t ki_sigmask; /* Current signal mask */ sigset_t ki_sigignore; /* Signals being ignored */ sigset_t ki_sigcatch; /* Signals being caught by user */ uid_t ki_uid; /* effective user id */ uid_t ki_ruid; /* Real user id */ uid_t ki_svuid; /* Saved effective user id */ gid_t ki_rgid; /* Real group id */ gid_t ki_svgid; /* Saved effective group id */ short ki_ngroups; /* number of groups */ gid_t ki_groups[NGROUPS]; /* groups */ vm_size_t ki_size; /* virtual size */ segsz_t ki_rssize; /* current resident set size in pages */ segsz_t ki_swrss; /* resident set size before last swap */ segsz_t ki_tsize; /* text size (pages) XXX */ segsz_t ki_dsize; /* data size (pages) XXX */ segsz_t ki_ssize; /* 
stack size (pages) */ u_short ki_xstat; /* Exit status for wait & stop signal */ u_short ki_acflag; /* Accounting flags */ fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime */ u_int ki_estcpu; /* Time averaged value of ki_cpticks */ u_int ki_slptime; /* Time since last blocked */ u_int ki_swtime; /* Time swapped in or out */ u_int64_t ki_runtime; /* Real time in microsec */ struct timeval ki_start; /* starting time */ struct timeval ki_childtime; /* time used by process children */ long ki_flag; /* P_* flags */ long ki_kiflag; /* KI_* flags (below) */ int ki_traceflag; /* Kernel trace points */ - u_char ki_priority; /* Process priority */ - u_char ki_usrpri; /* User-priority based on p_cpu */ - u_char ki_nativepri; /* Priority before propogation */ char ki_stat; /* S* process status */ char ki_nice; /* Process "nice" value */ char ki_lock; /* Process lock (prevent swap) count */ char ki_rqindex; /* Run queue index */ u_char ki_oncpu; /* Which cpu we are on */ u_char ki_lastcpu; /* Last cpu we were on */ char ki_comm[MAXCOMLEN+1]; /* command name */ char ki_wmesg[WMESGLEN+1]; /* wchan message */ char ki_login[MAXLOGNAME+1]; /* setlogin name */ char ki_mtxname[MTXNAMELEN+1]; /* mutex name */ char ki_sparestrings[102]; /* spare string space */ - struct rtprio ki_rtprio; /* Realtime priority */ struct rusage ki_rusage; /* process rusage statistics */ long ki_sflag; /* PS_* flags */ - long ki_spare[24]; /* spare constants */ + struct priority ki_pri; /* process priority */ + long ki_spare[25]; /* spare constants */ }; void fill_kinfo_proc __P((struct proc *, struct kinfo_proc *)); /* ki_sessflag values */ #define KI_CTTY 0x00000001 /* controlling tty vnode active */ #define KI_SLEADER 0x00000002 /* session leader */ #define KI_MTXBLOCK 0x00000004 /* proc blocked on mutex ki_mtxname */ /* * Per process structure containing data that isn't needed in core * when the process isn't running (esp. when swapped out). 
* This structure may or may not be at the same kernel address * in all processes. */ struct user { struct pcb u_pcb; struct sigacts u_sigacts; /* p_sigacts points here (use it!) */ struct pstats u_stats; /* p_stats points here (use it!) */ /* * Remaining fields only for core dump and/or ptrace-- * not valid at other times! */ struct kinfo_proc u_kproc; /* proc + eproc */ struct md_coredump u_md; /* machine dependent glop */ }; #endif Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 72375) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 72376) @@ -1,1076 +1,1076 @@ /* * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED proc0.p_ucred #define DEBUG 1 static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, int, int, int, int)); static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *)); static int readblock __P((struct buf *, daddr_t)); #ifdef DEBUG #include int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs_daddr_t rlbn; ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP]; int error, cg, snaploc, indiroff, numblks; int i, size, base, len, loc, inoblkcnt; int blksperindir, flag = mp->mnt_flag; void *space; struct fs *copy_fs, *fs = VFSTOUFS(mp)->um_fs; struct proc *p = CURPROC; struct inode *devip, *ip, *xp; struct buf *bp, *nbp, *ibp; struct vnode *vp, *devvp; struct nameidata nd; struct mount *wrtmp; struct dinode *dip; struct vattr vat; struct cg *cgp; /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. 
*/ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); vput(nd.ni_dvp); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); return (error); } vp = nd.ni_vp; ip = VTOI(vp); devvp = ip->i_devvp; devip = VTOI(devvp); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, B_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); ip->i_flag |= IN_CHANGE | IN_UPDATE; if ((error = readblock(bp, numblks - 1)) != 0) goto out; bawrite(bp); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. 
We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks. Also allocate shadow copies * for each of the indirect blocks. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) goto out; copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); bdwrite(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), fs->fs_bsize, p->p_ucred, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate shadow blocks to copy all of the other snapshot inodes * so that we will be able to expunge them from this snapshot. */ for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) { blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc])); for (i = 0; i < inoblkcnt; i++) if (inoblks[i] == blkno) break; if (i == inoblkcnt) { inoblks[inoblkcnt++] = blkno; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate copies for the superblock and its summary information. */ if ((error = VOP_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp))) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. 
*/ if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0) goto out; /* * All allocations are done, so we can now snapshot the system. * * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); vfs_write_suspend(vp->v_mount); if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } /* * First, copy all the cylinder group maps. All the unallocated * blocks are marked BLK_NOCOPY so that the snapshot knows that * it need not copy them if they are later written. */ len = howmany(fs->fs_fpg, fs->fs_frag); for (cg = 0; cg < fs->fs_ncg; cg++) { error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); goto out1; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); error = EIO; goto out1; } error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(bp); brelse(nbp); goto out1; } bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); base = cg * fs->fs_fpg / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) continue; ip->i_db[loc] = BLK_NOCOPY; } } error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) { brelse(bp); goto out1; } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) { brelse(bp); goto out1; } indiroff = 0; } if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) continue; if (((ufs_daddr_t *)(ibp->b_data))[indiroff] != 0) panic("ffs_snapshot: lost block"); ((ufs_daddr_t 
*)(ibp->b_data))[indiroff] = BLK_NOCOPY; } bqrelse(bp); ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); } /* * Snapshot the superblock and its summary information. */ if ((error = VOP_BALLOC(vp, SBOFF, SBSIZE, KERNCRED, 0, &nbp)) != 0) goto out1; copy_fs = (struct fs *)(nbp->b_data + blkoff(fs, SBOFF)); bcopy(fs, copy_fs, fs->fs_sbsize); copy_fs->fs_clean = 1; if (fs->fs_sbsize < SBSIZE) bzero(&nbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize], SBSIZE - fs->fs_sbsize); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize) - 1; size = fs->fs_bsize; space = fs->fs_csp; for (loc = 0; loc <= len; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; if (loc == len) { readblock(nbp, blkno + loc); size = fs->fs_cssize - loc * fs->fs_bsize; } bcopy(space, nbp->b_data, size); space = (char *)space + size; nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); } /* * Copy the shadow blocks for the snapshot inodes so that * the copies can can be expunged. */ for (loc = 0; loc < inoblkcnt; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; readblock(nbp, inoblks[loc]); nbp->b_flags |= B_VALIDSUSPWRT; bdwrite(nbp); } /* * Copy allocation information from other snapshots and then * expunge them from the view of the current snapshot. */ for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) { /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. 
*/ if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0) goto out1; blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn, rlbn, len, blksperindir); if (error) goto out1; blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } /* * Set copied snapshot inode to be a zero length file. */ blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number)); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; dip = (struct dinode *)nbp->b_data + ino_to_fsbo(fs, xp->i_number); dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t)); nbp->b_flags |= B_VALIDSUSPWRT; bdwrite(nbp); } /* * Copy all indirect blocks to their shadows (allocated above) * to avoid deadlock in ffs_copyonwrite. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) goto out1; copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); bqrelse(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), fs->fs_bsize, p->p_ucred, 0, &nbp); if (error) goto out1; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) { brelse(nbp); goto out1; } bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize); bqrelse(ibp); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. 
*/ fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_copyonwrite != 0) panic("ffs_snapshot: %d already on list", ip->i_number); if (devip->i_copyonwrite == 0) { devvp->v_flag |= VCOPYONWRITE; devip->i_copyonwrite = ip; } else { for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; ) xp = xp->i_copyonwrite; xp->i_copyonwrite = ip; } vp->v_flag |= VSYSTEM; /* * Resume operation on filesystem. */ out1: vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); out: mp->mnt_flag = flag; (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); if (error) vput(vp); else VOP_UNLOCK(vp, 0, p); vn_finished_write(wrtmp); return (error); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs_daddr_t blkno; int lbn; int rlbn; int remblks; int blksperindir; { int subblksperindir, error, last, num, i; struct indir indirs[NIADDR + 2]; ufs_daddr_t *bap; struct buf *bp; struct fs *fs; if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) panic("indiracct: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ fs = VTOI(cancelvp)->i_fs; bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. 
*/ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); if (snapvp != cancelvp) { bap = (ufs_daddr_t *)bp->b_data; } else { MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); } error = snapacct(snapvp, &bap[0], &bap[last]); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: if (snapvp != cancelvp) bqrelse(bp); else FREE(bap, M_DEVBUF); return (error); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int snapacct(vp, oldblkp, lastblkp) struct vnode *vp; ufs_daddr_t *oldblkp, *lastblkp; { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; ufs_daddr_t lbn, blkno, *blkp; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) return (error); blkp = &((ufs_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } if (*blkp != 0) panic("snapacct: bad block"); *blkp = BLK_SNAP; if (lbn >= NDADDR) { ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); } } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; /* * Find snapshot in incore list. 
*/ for (xp = VTOI(ip->i_devvp); xp; xp = xp->i_copyonwrite) if (xp->i_copyonwrite == ip) break; if (xp == 0) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); else vrele(ITOV(ip)); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip, *xp; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs_daddr_t blkno, dblk; int error, snaploc, loc, last; ip = VTOI(vp); fs = ip->i_fs; /* * Delete snapshot inode from superblock. Keep list dense. */ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } /* * Delete from incore list. * Clear copy-on-write flag if last snapshot. */ devvp = ip->i_devvp; for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) { if (xp->i_copyonwrite != ip) continue; xp->i_copyonwrite = ip->i_copyonwrite; ip->i_copyonwrite = 0; break; } if (xp == 0) printf("ffs_snapremove: lost snapshot vnode %d\n", ip->i_number); if (VTOI(devvp)->i_copyonwrite == 0) devvp->v_flag &= ~VCOPYONWRITE; /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
*/ for (blkno = 1; blkno < NDADDR; blkno++) { dblk = ip->i_db[blkno]; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || (dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(ip, dblk, fs->fs_bsize))) ip->i_db[blkno] = 0; } for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) continue; if ((last = fs->fs_size - blkno) > NINDIR(fs)) last = NINDIR(fs); for (loc = 0; loc < last; loc++) { dblk = ((ufs_daddr_t *)(ibp->b_data))[loc]; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || (dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(ip, dblk, fs->fs_bsize))) ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. 
*/ int ffs_snapblkfree(freeip, bno, size) struct inode *freeip; ufs_daddr_t bno; long size; { struct buf *ibp, *cbp, *savedcbp = 0; struct fs *fs = freeip->i_fs; struct proc *p = CURPROC; struct inode *ip; struct vnode *vp; ufs_daddr_t lbn, blkno; int indiroff = 0, error = 0, claimedblk = 0; lbn = fragstoblks(fs, bno); for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { vp = ITOV(ip); /* * Lookup block being written. */ if (lbn < NDADDR) { blkno = ip->i_db[lbn]; } else { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ switch (blkno) { /* * If the snapshot has already copied the block (default), * or does not care about the block, it is not needed. */ default: case BLK_NOCOPY: if (lbn >= NDADDR) bqrelse(ibp); continue; /* * No previous snapshot claimed the block, so it will be * freed and become a BLK_NOCOPY (don't care) for us. */ case BLK_SNAP: if (claimedblk) panic("snapblkfree: inconsistent block type"); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (lbn < NDADDR) { ip->i_db[lbn] = BLK_NOCOPY; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { ((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } VOP_UNLOCK(vp, 0, p); continue; /* * A block that we map is being freed. If it has not been * claimed yet, we will claim or copy it (below). */ case 0: claimedblk = 1; break; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. 
*/ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %d from inum %d\n", "Grabonremove: snapino", ip->i_number, lbn, freeip->i_number); #endif vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (lbn < NDADDR) { ip->i_db[lbn] = bno; } else { ((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } ip->i_blocks += btodb(size); ip->i_flag |= IN_CHANGE | IN_UPDATE; VOP_UNLOCK(vp, 0, p); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %d for inum %d size %ld to blkno %d\n", "Copyonremove: snapino ", ip->i_number, lbn, freeip->i_number, size, cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(cbp, lbn)) != 0) break; savedcbp = cbp; } if (savedcbp) bawrite(savedcbp); /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ return (error); } /* * Associate snapshot files when mounting. 
*/ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs = ump->um_fs; struct proc *p = CURPROC; struct inode *ip, **listtailp; struct vnode *vp; int error, snaploc, loc; listtailp = &VTOI(ump->um_devvp)->i_copyonwrite; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) return; if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0) { printf("ffs_snapshot_mount: non-snapshot inode %d\n", fs->fs_snapinum[snaploc]); vput(vp); for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } if (ip->i_copyonwrite != 0) panic("ffs_snapshot_mount: %d already on list", ip->i_number); *listtailp = ip; listtailp = &ip->i_copyonwrite; vp->v_flag |= VSYSTEM; VOP_UNLOCK(vp, 0, p); ump->um_devvp->v_flag |= VCOPYONWRITE; } } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct inode *devip = VTOI(ump->um_devvp); struct inode *xp; while ((xp = devip->i_copyonwrite) != 0) { devip->i_copyonwrite = xp->i_copyonwrite; xp->i_copyonwrite = 0; if (xp->i_effnlink > 0) vrele(ITOV(xp)); } ump->um_devvp->v_flag &= ~VCOPYONWRITE; } /* * Check for need to copy block that is about to be written, * copying the block if necessary. 
*/ int ffs_copyonwrite(ap) struct vop_copyonwrite_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp; struct fs *fs = VTOI(bp->b_vp)->i_fs; struct proc *p = CURPROC; struct inode *ip; struct vnode *vp; ufs_daddr_t lbn, blkno; int indiroff, error = 0; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); if (p->p_flag & P_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in VOP_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We have to * be able to do the VOP_BALLOC without blocking, otherwise * we may get in a deadlock with another process also * trying to allocate. If we find outselves unable to * get the buffer lock, we unlock the snapshot vnode, * sleep briefly, and try again. */ retry: vn_lock(vp, LK_SHARED | LK_RETRY, p); if (lbn < NDADDR) { blkno = ip->i_db[lbn]; } else { p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp); p->p_flag &= ~P_COWINPROGRESS; if (error) { VOP_UNLOCK(vp, 0, p); if (error != EWOULDBLOCK) break; - tsleep(vp, p->p_usrpri, "nap", 1); + tsleep(vp, p->p_pri.pri_user, "nap", 1); goto retry; } indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef DIAGNOSTIC if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) { VOP_UNLOCK(vp, 0, p); continue; } /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. 
*/ p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) { if (error != EWOULDBLOCK) break; - tsleep(vp, p->p_usrpri, "nap", 1); + tsleep(vp, p->p_pri.pri_user, "nap", 1); goto retry; } #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %d for ", ip->i_number, lbn); if (bp->b_vp == ap->a_vp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %d to blkno %d\n", bp->b_lblkno, cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(cbp, lbn)) != 0) break; savedcbp = cbp; } if (savedcbp) bawrite(savedcbp); return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(bp, lbn) struct buf *bp; daddr_t lbn; { struct uio auio; struct iovec aiov; struct proc *p = CURPROC; struct inode *ip = VTOI(bp->b_vp); aiov.iov_base = bp->b_data; aiov.iov_len = bp->b_bcount; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); auio.uio_resid = bp->b_bcount; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = p; return (physio(ip->i_devvp->v_rdev, &auio, 0)); } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 72375) +++ head/sys/vm/vm_glue.c (revision 72376) @@ -1,577 +1,577 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #include "opt_rlimit.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits __P((void *)); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. 
*/ static void scheduler __P((void *)); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL) static void swapout __P((struct proc *)); int kernacc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & (~VM_PROT_ALL)) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } int useracc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; vm_map_entry_t save_hint; KASSERT((rw & (~VM_PROT_ALL)) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; /* * XXX - check separately to disallow access to user area and user * page tables - they are in the map. * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once * only used (as an end address) in trap.c. Use it as an end address * here too. This bogusness has spread. I just fixed where it was * used as a max in vm_mmap.c. */ if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS || (vm_offset_t) addr + len < (vm_offset_t) addr) { return (FALSE); } map = &curproc->p_vmspace->vm_map; vm_map_lock_read(map); /* * We save the map hint, and restore it. Useracc appears to distort * the map hint unnecessarily. 
*/ save_hint = map->hint; rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); map->hint = save_hint; vm_map_unlock_read(map); return (rv == TRUE); } void vslock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), FALSE); } void vsunlock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), TRUE); } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_fork(p1, p2, flags) register struct proc *p1, *p2; int flags; { register struct user *up; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(p1, p2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; p1->p_vmspace->vm_refcnt++; } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); pmap_pinit2(vmspace_pmap(p2->p_vmspace)); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } pmap_new_proc(p2); up = p2->p_addr; /* * p_stats currently points at fields in the user struct * but not at &u, instead at p_addr. Copy parts of * p_stats; zero the rest of p_stats (statistics). * * If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL we dont' need * to share sigacts, so we use the up->u_sigacts. 
*/ p2->p_stats = &up->u_stats; if (p2->p_sigacts == NULL) { if (p2->p_procsig->ps_refcnt != 1) printf ("PID:%d NULL sigacts with refcnt not 1!\n",p2->p_pid); p2->p_sigacts = &up->u_sigacts; up->u_sigacts = *p1->p_sigacts; } bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(p1, p2, flags); } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { register struct proc *p = udata; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } /* * Must be called with the proc struc mutex held. 
*/ void faultin(p) struct proc *p; { mtx_assert(&p->p_mtx, MA_OWNED); mtx_lock_spin(&sched_lock); if ((p->p_sflag & PS_INMEM) == 0) { ++p->p_lock; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); mtx_assert(&Giant, MA_OWNED); pmap_swapin_proc(p); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (p->p_stat == SRUN) { setrunqueue(p); } p->p_sflag |= PS_INMEM; /* undo the effect of setting SLOCK above */ --p->p_lock; } mtx_unlock_spin(&sched_lock); } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * Giant is still held at this point, to be released in tsleep. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { register struct proc *p; register int pri; struct proc *pp; int ppri; mtx_assert(&Giant, MA_OWNED); loop: if (vm_page_count_min()) { VM_WAIT; goto loop; } pp = NULL; ppri = INT_MIN; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { mtx_lock_spin(&sched_lock); if (p->p_stat == SRUN && (p->p_sflag & (PS_INMEM | PS_SWAPPING)) == 0) { pri = p->p_swtime + p->p_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= p->p_nice * 8; } /* * if this process is higher priority and there is * enough space, then select this process instead of * the previous selection. */ if (pri > ppri) { pp = p; ppri = pri; } } mtx_unlock_spin(&sched_lock); } ALLPROC_LOCK(AP_RELEASE); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "sched", 0); goto loop; } mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPINREQ; mtx_unlock_spin(&sched_lock); /* * We would like to bring someone in. (only if there is space). 
*/ PROC_LOCK(p); faultin(p); PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); p->p_swtime = 0; mtx_unlock_spin(&sched_lock); goto loop; } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, ""); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, ""); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; int didswap = 0; outp = outp2 = NULL; outpri = outpri2 = INT_MIN; ALLPROC_LOCK(AP_SHARED); retry: LIST_FOREACH(p, &allproc, p_list) { struct vmspace *vm; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_TRACED|P_SYSTEM|P_WEXIT)) != 0) { PROC_UNLOCK(p); continue; } vm = p->p_vmspace; PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); if ((p->p_sflag & (PS_INMEM|PS_SWAPPING)) != PS_INMEM) { mtx_unlock_spin(&sched_lock); continue; } switch (p->p_stat) { default: mtx_unlock_spin(&sched_lock); continue; case SSLEEP: case SSTOP: /* * do not swapout a realtime process */ - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) { + if (PRI_IS_REALTIME(p->p_pri.pri_class)) { mtx_unlock_spin(&sched_lock); continue; } /* * Do not swapout a process waiting on a critical * event of some kind. Also guarantee swap_idle_threshold1 * time in memory. 
*/ - if (((p->p_priority & 0x7f) < PSOCK) || + if (((p->p_pri.pri_level) < PSOCK) || (p->p_slptime < swap_idle_threshold1)) { mtx_unlock_spin(&sched_lock); continue; } /* * If the system is under memory stress, or if we are swapping * idle processes >= swap_idle_threshold2, then swap the process * out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (p->p_slptime < swap_idle_threshold2))) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); ++vm->vm_refcnt; /* * do not swapout a process that is waiting for VM * data structures there is a possible deadlock. */ if (lockmgr(&vm->vm_map.lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) { vmspace_free(vm); continue; } vm_map_unlock(&vm->vm_map); /* * If the process has been asleep for awhile and had * most of its pages taken away already, swap it out. */ mtx_lock_spin(&sched_lock); if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (p->p_slptime > swap_idle_threshold2))) { mtx_unlock_spin(&sched_lock); swapout(p); vmspace_free(vm); didswap++; goto retry; } else mtx_unlock_spin(&sched_lock); } } ALLPROC_LOCK(AP_RELEASE); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. 
*/ if (didswap) wakeup(&proc0); } static void swapout(p) register struct proc *p; { #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); (void) splhigh(); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPING; if (p->p_stat == SRUN) remrunqueue(p); mtx_unlock_spin(&sched_lock); (void) spl0(); pmap_swapout_proc(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPING; p->p_swtime = 0; mtx_unlock_spin(&sched_lock); } #endif /* !NO_SWAPPING */ Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 72375) +++ head/sys/vm/vm_meter.c (revision 72376) @@ -1,360 +1,361 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ struct vmmeter cnt; static int maxslp = MAXSLP; /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. 
*/ static void loadav(struct loadavg *avg) { register int i, nrun; register struct proc *p; ALLPROC_LOCK(AP_SHARED); for (nrun = 0, p = LIST_FIRST(&allproc); p != 0; p = LIST_NEXT(p, p_list)) { switch (p->p_stat) { case SSLEEP: - if (p->p_priority > PZERO || p->p_slptime != 0) + if (p->p_pri.pri_level > PZERO || + p->p_slptime != 0) continue; /* FALLTHROUGH */ case SRUN: if ((p->p_flag & P_NOLOAD) != 0) continue; /* FALLTHROUGH */ case SIDL: nrun++; } } ALLPROC_LOCK(AP_RELEASE); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; } void vmmeter() { if (time_second % 5 == 0) loadav(&averunnable); if (proc0.p_slptime > maxslp / 2) wakeup(&proc0); } SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &cnt.v_free_min, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &cnt.v_free_target, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &cnt.v_free_reserved, 0, ""); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &cnt.v_inactive_target, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min, CTLFLAG_RW, &cnt.v_cache_min, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max, CTLFLAG_RW, &cnt.v_cache_max, 0, ""); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &cnt.v_free_severe, 0, ""); SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, &averunnable, loadavg, "Machine loadaverage history"); static int vmtotal(SYSCTL_HANDLER_ARGS) { struct proc *p; struct vmtotal total, *totalp; vm_map_entry_t entry; vm_object_t object; vm_map_t map; int paging; totalp = &total; bzero(totalp, sizeof *totalp); /* * Mark all objects as inactive. */ TAILQ_FOREACH(object, &vm_object_list, object_list) vm_object_clear_flag(object, OBJ_ACTIVE); /* * Calculate process statistics. 
*/ ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { if (p->p_flag & P_SYSTEM) continue; mtx_lock_spin(&sched_lock); switch (p->p_stat) { case 0: mtx_unlock_spin(&sched_lock); continue; case SMTX: case SSLEEP: case SSTOP: if (p->p_sflag & PS_INMEM) { - if (p->p_priority <= PZERO) + if (p->p_pri.pri_level <= PZERO) totalp->t_dw++; else if (p->p_slptime < maxslp) totalp->t_sl++; } else if (p->p_slptime < maxslp) totalp->t_sw++; if (p->p_slptime >= maxslp) { mtx_unlock_spin(&sched_lock); continue; } break; case SWAIT: totalp->t_sl++; continue; case SRUN: case SIDL: if (p->p_sflag & PS_INMEM) totalp->t_rq++; else totalp->t_sw++; if (p->p_stat == SIDL) { mtx_unlock_spin(&sched_lock); continue; } break; } mtx_unlock_spin(&sched_lock); /* * Note active objects. */ paging = 0; for (map = &p->p_vmspace->vm_map, entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || entry->object.vm_object == NULL) continue; vm_object_set_flag(entry->object.vm_object, OBJ_ACTIVE); paging |= entry->object.vm_object->paging_in_progress; } if (paging) totalp->t_pw++; } ALLPROC_LOCK(AP_RELEASE); /* * Calculate object memory usage statistics. 
*/ for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { /* * devices, like /dev/mem, will badly skew our totals */ if (object->type == OBJT_DEVICE) continue; totalp->t_vm += object->size; totalp->t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avm += object->size; totalp->t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ totalp->t_vmshr += object->size; totalp->t_rmshr += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avmshr += object->size; totalp->t_armshr += object->resident_page_count; } } } totalp->t_free = cnt.v_free_count + cnt.v_cache_count; return (sysctl_handle_opaque(oidp, totalp, sizeof total, req)); } SYSCTL_PROC(_vm, VM_METER, vmmeter, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_swtch, CTLFLAG_RD, &cnt.v_swtch, 0, "Context switches"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_trap, CTLFLAG_RD, &cnt.v_trap, 0, "Traps"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_syscall, CTLFLAG_RD, &cnt.v_syscall, 0, "Syscalls"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_intr, CTLFLAG_RD, &cnt.v_intr, 0, "Hardware interrupts"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_soft, CTLFLAG_RD, &cnt.v_soft, 0, "Software interrupts"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLFLAG_RD, &cnt.v_vm_faults, 0, "VM faults"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLFLAG_RD, &cnt.v_cow_faults, 0, "COW faults"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLFLAG_RD, &cnt.v_cow_optim, 0, "Optimized COW faults"); 
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_zfod, CTLFLAG_RD, &cnt.v_zfod, 0, "Zero fill"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_ozfod, CTLFLAG_RD, &cnt.v_ozfod, 0, "Optimized zero fill"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swapin, CTLFLAG_RD, &cnt.v_swapin, 0, "Swapin operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swapout, CTLFLAG_RD, &cnt.v_swapout, 0, "Swapout operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLFLAG_RD, &cnt.v_swappgsin, 0, "Swapin pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swappgsout, CTLFLAG_RD, &cnt.v_swappgsout, 0, "Swapout pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodein, CTLFLAG_RD, &cnt.v_vnodein, 0, "Vnodein operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLFLAG_RD, &cnt.v_vnodeout, 0, "Vnodeout operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLFLAG_RD, &cnt.v_vnodepgsin, 0, "Vnodein pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLFLAG_RD, &cnt.v_vnodepgsout, 0, "Vnodeout pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_intrans, CTLFLAG_RD, &cnt.v_intrans, 0, "In transit page blocking"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_reactivated, CTLFLAG_RD, &cnt.v_reactivated, 0, "Reactivated pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLFLAG_RD, &cnt.v_pdwakeups, 0, "Pagedaemon wakeups"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pdpages, CTLFLAG_RD, &cnt.v_pdpages, 0, "Pagedaemon page scans"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_dfree, CTLFLAG_RD, &cnt.v_dfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pfree, CTLFLAG_RD, &cnt.v_pfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tfree, CTLFLAG_RD, &cnt.v_tfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_page_size, CTLFLAG_RD, &cnt.v_page_size, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_page_count, CTLFLAG_RD, &cnt.v_page_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLFLAG_RD, &cnt.v_free_reserved, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_target, CTLFLAG_RD, 
&cnt.v_free_target, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_min, CTLFLAG_RD, &cnt.v_free_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_count, CTLFLAG_RD, &cnt.v_free_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_wire_count, CTLFLAG_RD, &cnt.v_wire_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_active_count, CTLFLAG_RD, &cnt.v_active_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLFLAG_RD, &cnt.v_inactive_target, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLFLAG_RD, &cnt.v_inactive_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, &cnt.v_cache_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_min, CTLFLAG_RD, &cnt.v_cache_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_max, CTLFLAG_RD, &cnt.v_cache_max, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLFLAG_RD, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLFLAG_RD, &cnt.v_interrupt_free_min, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_forks, CTLFLAG_RD, &cnt.v_forks, 0, "Number of fork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vforks, CTLFLAG_RD, &cnt.v_vforks, 0, "Number of vfork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_rforks, CTLFLAG_RD, &cnt.v_rforks, 0, "Number of rfork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_kthreads, CTLFLAG_RD, &cnt.v_kthreads, 0, "Number of fork() calls by kernel"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_forkpages, CTLFLAG_RD, &cnt.v_forkpages, 0, "VM pages affected by fork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLFLAG_RD, &cnt.v_vforkpages, 0, "VM pages affected by vfork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLFLAG_RD, &cnt.v_rforkpages, 0, "VM pages affected by rfork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLFLAG_RD, 
&cnt.v_kthreadpages, 0, "VM pages affected by fork() by kernel"); #if 0 SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_mask, CTLFLAG_RD, &page_mask, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_shift, CTLFLAG_RD, &page_shift, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, first_page, CTLFLAG_RD, &first_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, last_page, CTLFLAG_RD, &last_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, vm_page_bucket_count, CTLFLAG_RD, &vm_page_bucket_count, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, vm_page_hash_mask, CTLFLAG_RD, &vm_page_hash_mask, 0, ""); #endif