Index: head/sys/alpha/alpha/trap.c =================================================================== --- head/sys/alpha/alpha/trap.c (revision 116360) +++ head/sys/alpha/alpha/trap.c (revision 116361) @@ -1,1307 +1,1307 @@ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); /* #include "opt_fix_unaligned_vax_fp.h" */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif #include /* for handle_opdec() */ unsigned long Sfloat_to_reg(unsigned int); unsigned int reg_to_Sfloat(unsigned long); unsigned long Tfloat_reg_cvt(unsigned long); #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(unsigned int); unsigned int reg_to_Ffloat(unsigned long); unsigned long Gfloat_reg_cvt(unsigned long); #endif int unaligned_fixup(unsigned long, unsigned long, unsigned long, struct thread *); int handle_opdec(struct thread *td, u_int64_t *ucodep); static void printtrap(const unsigned long, const unsigned long, const unsigned long, const unsigned long, struct trapframe *, int, int); #ifdef WITNESS extern char *syscallnames[]; #endif static const char *arith_exceptions[] = { "software completion", "invalid operation", "division by zero", "overflow", "underflow", "inexact result", "integer overflow", }; static const char *instruction_faults[] = { "bpt", "bugchk", "gentrap", "FEN", "opDec" }; static const char *interrupt_types[] = { "interprocessor", "clock", "correctable error", "machine check", "I/O device", "performance counter" }; static const char *mmfault_types[] = { "translation not valid", "access violation", "fault on read", "fault on execute", "fault on write" }; static const char *mmfault_causes[] = { "instruction fetch", "load instructon", "store instruction" }; static void printtrap(a0, a1, a2, entry, framep, isfatal, user) const unsigned long a0, a1, a2, entry; struct trapframe *framep; int isfatal, user; { char ubuf[64]; const char *entryname; unsigned long i; switch (entry) { case ALPHA_KENTRY_INT: entryname = "interrupt"; break; case ALPHA_KENTRY_ARITH: entryname = "arithmetic trap"; break; case ALPHA_KENTRY_MM: entryname = "memory management fault"; break; case ALPHA_KENTRY_IF: entryname = "instruction fault"; break; case ALPHA_KENTRY_UNA: entryname = "unaligned access 
fault"; break; case ALPHA_KENTRY_SYS: entryname = "system call"; break; default: snprintf(ubuf, sizeof(ubuf), "type %lx", entry); entryname = (const char *) ubuf; break; } printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap entry = 0x%lx (%s)\n", entry, entryname); #ifdef SMP printf(" cpuid = %d\n", PCPU_GET(cpuid)); #endif switch (entry) { case ALPHA_KENTRY_INT: printf(" interrupt type = "); if (a0 < 5) { printf("%s\n", interrupt_types[a0]); if (a0 > 1) { printf(" vector = 0x%lx\n", a1); if (a0 < 3) printf(" logout area = 0x%lx\n", a2); } } else printf("0x%lx (unknown)\n", a0); break; case ALPHA_KENTRY_ARITH: printf(" exception type = "); for (i = 0; i < 7; i++) if (a0 & (1 << i)) { printf("%s", arith_exceptions[i]); if (a0 & (~0 - (1 << i))) printf(", "); } printf("\n"); printf(" register mask = 0x%lx", a1); break; case ALPHA_KENTRY_MM: printf(" faulting va = 0x%lx\n", a0); printf(" type = "); if (a1 < 5) printf("%s\n", mmfault_types[a1]); else printf("0x%lx (unknown)\n", a1); printf(" cause = "); i = a2 + 1; if (i < 3) printf("%s\n", mmfault_causes[i]); else printf("0x%lx (unknown)\n", a2); break; case ALPHA_KENTRY_IF: printf(" fault type = "); if (a0 < 5) printf("%s\n", instruction_faults[a0]); else printf("0x%lx (unknown)\n", a0); break; case ALPHA_KENTRY_UNA: printf(" faulting va = 0x%lx\n", a0); printf(" opcode = 0x%lx\n", a1); printf(" register = 0x%lx\n", a2); break; default: printf(" a0 = 0x%lx\n", a0); printf(" a1 = 0x%lx\n", a1); printf(" a2 = 0x%lx\n", a2); break; } printf(" pc = 0x%lx\n", framep->tf_regs[FRAME_PC]); printf(" ra = 0x%lx\n", framep->tf_regs[FRAME_RA]); printf(" sp = 0x%lx\n", framep->tf_regs[FRAME_SP]); if (curthread != NULL && (curthread->td_proc->p_flag & P_KTHREAD) == 0) printf(" usp = 0x%lx\n", alpha_pal_rdusp()); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_proc->p_comm); printf("\n"); } /* * Trap is called from locore to handle most types of processor traps. * System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * Alpha architecture. */ /*ARGSUSED*/ void trap(a0, a1, a2, entry, framep) const unsigned long a0, a1, a2, entry; struct trapframe *framep; { register struct thread *td; register struct proc *p; register int i; u_int64_t ucode; u_int sticks; int user; #ifdef SMP register_t s; #endif /* * Find our per-cpu globals. */ #ifdef SMP s = intr_disable(); #endif pcpup = (struct pcpu *) alpha_pal_rdval(); td = curthread; #ifdef SMP if (td == NULL) { printtrap(a0, a1, a2, entry, framep, 1, 0); cpu_halt(); } td->td_md.md_kernnest++; intr_restore(s); #endif p = td->td_proc; /* GIANT_REQUIRED; * Giant hasn't been acquired yet. */ cnt.v_trap++; ucode = 0; user = (framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) != 0; CTR5(KTR_TRAP, "%s trap: pid %d, (%lx, %lx, %lx)", user ? "user" : "kernel", p->p_pid, a0, a1, a2); if (user) { sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); } #ifdef DIAGNOSTIC if (user) alpha_fpstate_check(td); #endif switch (entry) { case ALPHA_KENTRY_UNA: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). 
*/ if (user) { mtx_lock(&Giant); i = unaligned_fixup(a0, a1, a2, td); mtx_unlock(&Giant); if (i == 0) goto out; ucode = a0; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case ALPHA_KENTRY_ARITH: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. */ if (user) { mtx_lock(&Giant); if (a0 & EXCSUM_SWC) if (fp_software_completion(a1, td)) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); i = SIGFPE; ucode = a0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case ALPHA_KENTRY_IF: /* * These are always fatal in kernel, and should never happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured; BUGCHK * is used to invoke the kernel debugger, and we * might have set a breakpoint. */ if (a0 == ALPHA_IF_CODE_BUGCHK || a0 == ALPHA_IF_CODE_BPT) { if (kdb_trap(a0, a1, a2, entry, framep)) goto out; } /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! */ #endif goto dopanic; } i = 0; switch (a0) { case ALPHA_IF_CODE_GENTRAP: if (framep->tf_regs[FRAME_A0] == -2) { /* weird! */ i = SIGFPE; ucode = a0; /* exception summary */ break; } /* FALLTHROUGH */ case ALPHA_IF_CODE_BPT: case ALPHA_IF_CODE_BUGCHK: if (td->td_md.md_flags & (MDTD_STEP1|MDTD_STEP2)) { mtx_lock(&Giant); ptrace_clear_single_step(td); td->td_frame->tf_regs[FRAME_PC] -= 4; mtx_unlock(&Giant); } ucode = a0; /* trap type */ i = SIGTRAP; break; case ALPHA_IF_CODE_OPDEC: i = handle_opdec(td, &ucode); if (i == 0) goto out; break; case ALPHA_IF_CODE_FEN: /* * on exit from the kernel, if thread == fpcurthread, * FP is enabled. */ if (PCPU_GET(fpcurthread) == td) { printf("trap: fp disabled for fpcurthread == %p", td); goto dopanic; } alpha_fpstate_switch(td); goto out; default: printf("trap: unknown IF type 0x%lx\n", a0); goto dopanic; } break; case ALPHA_KENTRY_MM: switch (a1) { case ALPHA_MMCSR_FOR: case ALPHA_MMCSR_FOE: case ALPHA_MMCSR_FOW: pmap_emulate_reference(p->p_vmspace, a0, user, a1 == ALPHA_MMCSR_FOW); goto out; case ALPHA_MMCSR_INVALTRANS: case ALPHA_MMCSR_ACCESS: { register vm_offset_t va; register struct vmspace *vm = NULL; register vm_map_t map; vm_prot_t ftype = 0; int rv; /* * If it was caused by fuswintr or suswintr, * just punt. Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && td != NULL && td->td_pcb->pcb_onfault == (unsigned long)fswintrberr && td->td_pcb->pcb_accessaddr == a0) { framep->tf_regs[FRAME_PC] = td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = 0; goto out; } /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. 
*/ if (!user && ((a0 >= VM_MIN_KERNEL_ADDRESS) || (td == NULL) || (td->td_pcb->pcb_onfault == 0))) { if (a0 >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && a0 < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } switch (a2) { case -1: /* instruction fetch fault */ case 0: /* load instruction */ ftype = VM_PROT_READ; break; case 1: /* store instruction */ ftype = VM_PROT_WRITE; break; #ifdef DIAGNOSTIC default: /* XXX gcc -Wuninitialized */ goto dopanic; #endif } va = trunc_page((vm_offset_t)a0); if (map != kernel_map) { /* * Keep swapout from messing with us * during thiscritical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) goto out; if (!user) { /* Check for copyin/copyout fault */ if (td != NULL && td->td_pcb->pcb_onfault != 0) { framep->tf_regs[FRAME_PC] = td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = 0; goto out; } goto dopanic; } ucode = a0; i = SIGSEGV; #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif break; } default: printf("trap: unknown MMCSR value 0x%lx\n", a1); goto dopanic; } break; default: goto dopanic; } #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif framep->tf_regs[FRAME_TRAPARG_A0] = a0; framep->tf_regs[FRAME_TRAPARG_A1] = a1; framep->tf_regs[FRAME_TRAPARG_A2] = a2; trapsignal(td, i, ucode); out: if (user) { framep->tf_regs[FRAME_SP] = alpha_pal_rdusp(); userret(td, framep, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif } return; dopanic: printtrap(a0, a1, a2, entry, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(a0, a1, a2, entry, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in v0, and the arguments in the registers (as normal). They return * an error flag in a3 (if a3 != 0 on return, the syscall had an error), * and the return value (if any) in v0. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into their places * in the trap frame. On return, it restores the callee-saved registers, * a3, and v0 from the frame before returning to the user process. */ void syscall(code, framep) u_int64_t code; struct trapframe *framep; { struct sysent *callp; struct thread *td; struct proc *p; int error = 0; u_int64_t opc; u_int sticks; u_int64_t args[10]; /* XXX */ u_int hidden = 0, nargs; #ifdef SMP register_t s; #endif /* * Find our per-cpu globals. */ #ifdef SMP s = intr_disable(); #endif pcpup = (struct pcpu *) alpha_pal_rdval(); td = curthread; #ifdef SMP td->td_md.md_kernnest++; intr_restore(s); #endif p = td->td_proc; framep->tf_regs[FRAME_TRAPARG_A0] = 0; framep->tf_regs[FRAME_TRAPARG_A1] = 0; framep->tf_regs[FRAME_TRAPARG_A2] = 0; #if notdef /* can't happen, ever. 
*/ if ((framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) == 0) panic("syscall"); #endif cnt.v_syscall++; td->td_frame = framep; opc = framep->tf_regs[FRAME_PC] - 4; sticks = td->td_sticks; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); #ifdef DIAGNOSTIC alpha_fpstate_check(td); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the alpha, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. */ code = framep->tf_regs[FRAME_A0]; hidden = 1; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; nargs = (callp->sy_narg & SYF_ARGMASK) + hidden; switch (nargs) { default: if (nargs > 10) /* XXX */ panic("syscall: too many args (%d)", nargs); error = copyin((caddr_t)(alpha_pal_rdusp()), &args[6], (nargs - 6) * sizeof(u_int64_t)); case 6: args[5] = framep->tf_regs[FRAME_A5]; case 5: args[4] = framep->tf_regs[FRAME_A4]; case 4: args[3] = framep->tf_regs[FRAME_A3]; case 3: args[2] = framep->tf_regs[FRAME_A2]; case 2: args[1] = framep->tf_regs[FRAME_A1]; case 1: args[0] = framep->tf_regs[FRAME_A0]; case 0: break; } #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, (callp->sy_narg & SYF_ARGMASK), args + hidden); #endif /* * Try to run the syscall without the MP lock if the syscall * is MP safe */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(td, args + hidden); } switch (error) { case 0: framep->tf_regs[FRAME_V0] = td->td_retval[0]; framep->tf_regs[FRAME_A4] = td->td_retval[1]; framep->tf_regs[FRAME_A3] = 0; break; case ERESTART: framep->tf_regs[FRAME_PC] = opc; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_regs[FRAME_V0] = error; framep->tf_regs[FRAME_A3] = 1; break; } /* * Release Giant if we had to get it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); userret(td, framep, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Unaligned access handler. It's not clear that this can get much slower... * */ const static int reg_to_framereg[32] = { FRAME_V0, FRAME_T0, FRAME_T1, FRAME_T2, FRAME_T3, FRAME_T4, FRAME_T5, FRAME_T6, FRAME_T7, FRAME_S0, FRAME_S1, FRAME_S2, FRAME_S3, FRAME_S4, FRAME_S5, FRAME_S6, FRAME_A0, FRAME_A1, FRAME_A2, FRAME_A3, FRAME_A4, FRAME_A5, FRAME_T8, FRAME_T9, FRAME_T10, FRAME_T11, FRAME_RA, FRAME_T12, FRAME_AT, FRAME_GP, FRAME_SP, -1, }; #define irp(td, reg) \ ((reg_to_framereg[(reg)] == -1) ? 
NULL : \ &(td)->td_frame->tf_regs[reg_to_framereg[(reg)]]) #define frp(td, reg) \ (&(td)->td_pcb->pcb_fp.fpr_regs[(reg)]) #define unaligned_load(storage, ptrf, mod) \ if (copyin((caddr_t)va, &(storage), sizeof (storage)) == 0 && \ (regptr = ptrf(td, reg)) != NULL) \ signal = 0; \ else \ break; \ *regptr = mod (storage); #define unaligned_store(storage, ptrf, mod) \ if ((regptr = ptrf(td, reg)) == NULL) \ (storage) = 0; \ else \ (storage) = mod (*regptr); \ if (copyout(&(storage), (caddr_t)va, sizeof (storage)) == 0) \ signal = 0; \ else \ break; #define unaligned_load_integer(storage) \ unaligned_load(storage, irp, ) #define unaligned_store_integer(storage) \ unaligned_store(storage, irp, ) #define unaligned_load_floating(storage, mod) \ alpha_fpstate_save(td, 1); \ unaligned_load(storage, frp, mod) #define unaligned_store_floating(storage, mod) \ alpha_fpstate_save(td, 0); \ unaligned_store(storage, frp, mod) unsigned long Sfloat_to_reg(s) unsigned int s; { unsigned long sign, expn, frac; unsigned long result; sign = (s & 0x80000000) >> 31; expn = (s & 0x7f800000) >> 23; frac = (s & 0x007fffff) >> 0; /* map exponent part, as appropriate. */ if (expn == 0xff) expn = 0x7ff; else if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frac << 29); return (result); } unsigned int reg_to_Sfloat(r) unsigned long r; { unsigned long sign, expn, frac; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frac = (r & 0x000fffffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 31) | (expn << 23) | (frac << 0); return (result); } /* * Conversion of T floating datums to and from register format * requires no bit reordering whatsoever. */ unsigned long Tfloat_reg_cvt(input) unsigned long input; { return (input); } #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(f) unsigned int f; { unsigned long sign, expn, frlo, frhi; unsigned long result; sign = (f & 0x00008000) >> 15; expn = (f & 0x00007f80) >> 7; frhi = (f & 0x0000007f) >> 0; frlo = (f & 0xffff0000) >> 16; /* map exponent part, as appropriate. */ if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frhi << 45) | (frlo << 29); return (result); } unsigned int reg_to_Ffloat(r) unsigned long r; { unsigned long sign, expn, frhi, frlo; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frhi = (r & 0x000fe00000000000) >> 45; frlo = (r & 0x00001fffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 15) | (expn << 7) | (frhi << 0) | (frlo << 16); return (result); } /* * Conversion of G floating datums to and from register format is * symmetrical. Just swap shorts in the quad... 
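For normal (non-zero, non-NaN) values, the exponent remapping done by Sfloat_to_reg() above is exactly the IEEE single-to-double rebias: the new 11-bit exponent is the old 8-bit exponent plus 0x380 (1023 - 127), and the 23-bit fraction is shifted up by 29 bits into the 52-bit field. A small host-side check (illustration only, not kernel code, assumes IEEE floats on the host):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	float f = 1.5f;			/* S-float pattern 0x3fc00000 */
	double d = (double)f;		/* register-format pattern */
	uint32_t s;
	uint64_t r;

	memcpy(&s, &f, sizeof(s));
	memcpy(&r, &d, sizeof(r));
	/* Prints 0x3fc00000 -> 0x3ff8000000000000, matching
	 * Sfloat_to_reg(): sign<<63 | (0x7f + 0x380)<<52 | frac<<29. */
	printf("0x%08x -> 0x%016llx\n", s, (unsigned long long)r);
	return (0);
}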
*/ unsigned long Gfloat_reg_cvt(input) unsigned long input; { unsigned long a, b, c, d; unsigned long result; a = (input & 0x000000000000ffff) >> 0; b = (input & 0x00000000ffff0000) >> 16; c = (input & 0x0000ffff00000000) >> 32; d = (input & 0xffff000000000000) >> 48; result = (a << 48) | (b << 32) | (c << 16) | (d << 0); return (result); } #endif /* FIX_UNALIGNED_VAX_FP */ extern int alpha_unaligned_print, alpha_unaligned_fix; extern int alpha_unaligned_sigbus; struct unaligned_fixup_data { const char *type; /* opcode name */ int size; /* size, 0 if fixup not supported */ }; int unaligned_fixup(va, opcode, reg, td) unsigned long va, opcode, reg; struct thread *td; { int doprint, dofix, dosigbus; int signal, size; const char *type; struct proc *p; unsigned long *regptr, longdata, uac; int intdata; /* signed to get extension when storing */ u_int16_t worddata; /* unsigned to _avoid_ extension */ const struct unaligned_fixup_data tab_0c[0x2] = { { "ldwu", 2 }, { "stw", 2 }, }; const struct unaligned_fixup_data tab_20[0x10] = { #ifdef FIX_UNALIGNED_VAX_FP { "ldf", 4 }, { "ldg", 8 }, #else { "ldf", 0 }, { "ldg", 0 }, #endif { "lds", 4 }, { "ldt", 8 }, #ifdef FIX_UNALIGNED_VAX_FP { "stf", 4 }, { "stg", 8 }, #else { "stf", 0 }, { "stg", 0 }, #endif { "sts", 4 }, { "stt", 8 }, { "ldl", 4 }, { "ldq", 8 }, { "ldl_l", 0 }, { "ldq_l", 0 }, /* can't fix */ { "stl", 4 }, { "stq", 8 }, { "stl_c", 0 }, { "stq_c", 0 }, /* can't fix */ }; /* * Figure out what actions to take. * */ if (td) { p = td->td_proc; uac = p->p_md.md_uac; } else { uac = 0; p = NULL; } doprint = alpha_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = alpha_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = alpha_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * Find out which opcode it is. Arrange to have the opcode * printed if it's an unknown opcode. */ if (opcode >= 0x0c && opcode <= 0x0d) { type = tab_0c[opcode - 0x0c].type; size = tab_0c[opcode - 0x0c].size; } else if (opcode >= 0x20 && opcode <= 0x2f) { type = tab_20[opcode - 0x20].type; size = tab_20[opcode - 0x20].size; } else { type = "0x%lx"; size = 0; } /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. */ if (doprint) { uprintf( "pid %d (%s): unaligned access: va=0x%lx pc=0x%lx ra=0x%lx op=", p->p_pid, p->p_comm, va, td->td_frame->tf_regs[FRAME_PC], td->td_frame->tf_regs[FRAME_RA]); uprintf(type,opcode); uprintf("\n"); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. 
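The fixup path that follows never dereferences the misaligned address directly: unaligned_load_*/unaligned_store_* go through copyin()/copyout(), so the access is performed bytewise against the user address space and a bad address simply fails instead of faulting the kernel. A stripped-down user-space analogue of the load side (hypothetical helper, little-endian assumption as in the XXX comments below):

#include <stdint.h>
#include <string.h>

/* Emulate an unaligned 32-bit load the way the fixup code does:
 * copy the bytes rather than issuing a 4-byte load at the odd
 * address, then sign-extend to 64 bits as the Alpha ldl would. */
static int64_t
emulated_ldl(const void *va)
{
	int32_t v;

	memcpy(&v, va, sizeof(v));
	return ((int64_t)v);
}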
*/ signal = SIGBUS; if (dofix && size != 0) { switch (opcode) { case 0x0c: /* ldwu */ /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ unaligned_load_integer(worddata); break; case 0x0d: /* stw */ /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ unaligned_store_integer(worddata); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x20: /* ldf */ unaligned_load_floating(intdata, Ffloat_to_reg); break; case 0x21: /* ldg */ unaligned_load_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x22: /* lds */ unaligned_load_floating(intdata, Sfloat_to_reg); break; case 0x23: /* ldt */ unaligned_load_floating(longdata, Tfloat_reg_cvt); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x24: /* stf */ unaligned_store_floating(intdata, reg_to_Ffloat); break; case 0x25: /* stg */ unaligned_store_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x26: /* sts */ unaligned_store_floating(intdata, reg_to_Sfloat); break; case 0x27: /* stt */ unaligned_store_floating(longdata, Tfloat_reg_cvt); break; case 0x28: /* ldl */ unaligned_load_integer(intdata); break; case 0x29: /* ldq */ unaligned_load_integer(longdata); break; case 0x2c: /* stl */ unaligned_store_integer(intdata); break; case 0x2d: /* stq */ unaligned_store_integer(longdata); break; #ifdef DIAGNOSTIC default: panic("unaligned_fixup: can't get here"); #endif } } /* * Force SIGBUS if requested. */ if (dosigbus) signal = SIGBUS; out: return (signal); } /* * Reserved/unimplemented instruction (opDec fault) handler * * Argument is the process that caused it. No useful information * is passed to the trap handler other than the fault type. The * address of the instruction that caused the fault is 4 less than * the PC stored in the trap frame. * * If the instruction is emulated successfully, this function returns 0. * Otherwise, this function returns the signal to deliver to the process, * and fills in *ucodep with the code to be delivered. */ int handle_opdec(td, ucodep) struct thread *td; u_int64_t *ucodep; { alpha_instruction inst; register_t *regptr, memaddr; u_int64_t inst_pc; int sig; /* * Read USP into frame in case it's going to be used or modified. * This keeps us from having to check for it in lots of places * later. */ td->td_frame->tf_regs[FRAME_SP] = alpha_pal_rdusp(); inst_pc = memaddr = td->td_frame->tf_regs[FRAME_PC] - 4; if (copyin((caddr_t)inst_pc, &inst, sizeof (inst)) != 0) { /* * really, this should never happen, but in case it * does we handle it. */ printf("WARNING: handle_opdec() couldn't fetch instruction\n"); goto sigsegv; } switch (inst.generic_format.opcode) { case op_ldbu: case op_ldwu: case op_stw: case op_stb: regptr = irp(td, inst.mem_format.rs); if (regptr != NULL) memaddr = *regptr; else memaddr = 0; memaddr += inst.mem_format.displacement; regptr = irp(td, inst.mem_format.rd); if (inst.mem_format.opcode == op_ldwu || inst.mem_format.opcode == op_stw) { if (memaddr & 0x01) { sig = unaligned_fixup(memaddr, inst.mem_format.opcode, inst.mem_format.rd, td); if (sig) goto unaligned_fixup_sig; break; } } if (inst.mem_format.opcode == op_ldbu) { u_int8_t b; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ if (copyin((caddr_t)memaddr, &b, sizeof (b)) != 0) goto sigsegv; if (regptr != NULL) *regptr = b; } else if (inst.mem_format.opcode == op_ldwu) { u_int16_t w; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ if (copyin((caddr_t)memaddr, &w, sizeof (w)) != 0) goto sigsegv; if (regptr != NULL) *regptr = w; } else if (inst.mem_format.opcode == op_stw) { u_int16_t w; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ w = (regptr != NULL) ? 
*regptr : 0; if (copyout(&w, (caddr_t)memaddr, sizeof (w)) != 0) goto sigsegv; } else if (inst.mem_format.opcode == op_stb) { u_int8_t b; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ b = (regptr != NULL) ? *regptr : 0; if (copyout(&b, (caddr_t)memaddr, sizeof (b)) != 0) goto sigsegv; } break; case op_intmisc: if (inst.operate_generic_format.function == op_sextb && inst.operate_generic_format.ra == 31) { int8_t b; if (inst.operate_generic_format.is_lit) { b = inst.operate_lit_format.literal; } else { if (inst.operate_reg_format.sbz != 0) goto sigill; regptr = irp(td, inst.operate_reg_format.rt); b = (regptr != NULL) ? *regptr : 0; } regptr = irp(td, inst.operate_generic_format.rc); if (regptr != NULL) *regptr = b; break; } if (inst.operate_generic_format.function == op_sextw && inst.operate_generic_format.ra == 31) { int16_t w; if (inst.operate_generic_format.is_lit) { w = inst.operate_lit_format.literal; } else { if (inst.operate_reg_format.sbz != 0) goto sigill; regptr = irp(td, inst.operate_reg_format.rt); w = (regptr != NULL) ? *regptr : 0; } regptr = irp(td, inst.operate_generic_format.rc); if (regptr != NULL) *regptr = w; break; } goto sigill; default: goto sigill; } /* * Write back USP. Note that in the error cases below, * nothing will have been successfully modified so we don't * have to write it out. */ alpha_pal_wrusp(td->td_frame->tf_regs[FRAME_SP]); return (0); sigill: *ucodep = ALPHA_IF_CODE_OPDEC; /* trap type */ return (SIGILL); sigsegv: sig = SIGSEGV; td->td_frame->tf_regs[FRAME_PC] = inst_pc; /* re-run instr. */ unaligned_fixup_sig: *ucodep = memaddr; /* faulting address */ return (sig); } Index: head/sys/alpha/linux/linux_sysvec.c =================================================================== --- head/sys/alpha/linux/linux_sysvec.c (revision 116360) +++ head/sys/alpha/linux/linux_sysvec.c (revision 116361) @@ -1,287 +1,287 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!" 
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef szsigcode MODULE_VERSION(linux, 1); MODULE_DEPEND(linux, osf1, 1, 1, 1); MODULE_DEPEND(linux, sysvmsg, 1, 1, 1); MODULE_DEPEND(linux, sysvsem, 1, 1, 1); MODULE_DEPEND(linux, sysvshm, 1, 1, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { Elf64_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe elf_linux_fixup(), should be curproc")); args = (Elf64_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(imgp) struct image_params *imgp; { const char *head; int error; head = (const char *)imgp->image_header; error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds * attempt to use the alternate path for the interpreter. If * an alternate path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { char *rpath = NULL; linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL, imgp->interpreter_name, &rpath, 0); if (rpath != imgp->interpreter_name) { int len = strlen(rpath) + 1; if (len <= MAXSHELLCMDLEN) { memcpy(imgp->interpreter_name, rpath, len); } free(rpath, M_TEMP); } } } return(error); } /* * To maintain OSF/1 compat, linux uses BSD signals & errnos on their * alpha port. This greatly simplfies things for us. 
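The SHELLMAGIC test used by exec_linux_imgact_try() reads the first two bytes of the image header as a short, so the constant is byte-order dependent: on little-endian Alpha '#' (0x23) is the low byte and '!' (0x21) the high byte, giving 0x2123. A host-side illustration (not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char head[2] = { '#', '!' };
	uint16_t magic;

	memcpy(&magic, head, sizeof(magic));
	/* 0x2123 on a little-endian host, 0x2321 on big-endian. */
	printf("shell magic = 0x%04x\n", magic);
	return (0);
}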
*/ struct sysentvec elf_linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0, 0, NULL, 0, NULL, NULL, elf_linux_fixup, osendsig, linux_sigcode, &linux_szsigcode, NULL, "Linux ELF", elf64_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs }; static Elf64_Brandinfo linux_brand = { ELFOSABI_LINUX, EM_ALPHA, "Linux", "/compat/linux", "/lib/ld-linux.so.1", &elf_linux_sysvec }; static Elf64_Brandinfo linux_glibc2brand = { ELFOSABI_LINUX, EM_ALPHA, "Linux", "/compat/linux", "/lib/ld-linux.so.2", &elf_linux_sysvec }; Elf64_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); linux_mib_destroy(); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: break; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DUMMY(rt_sigreturn); DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 116360) +++ head/sys/amd64/amd64/pmap.c (revision 116361) @@ -1,3005 +1,3005 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
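The index helpers defined below (pmap_pml4e_index, pmap_pdpe_index, pmap_pde_index, pmap_pte_index) each extract one 9-bit field from the virtual address, one field per level of the four-level amd64 page-table tree, with the low 12 bits left over as the page offset. A standalone sketch of that decomposition, with the shift constants written out explicitly rather than taken from the pmap headers:

#include <stdint.h>
#include <stdio.h>

/* Split a canonical amd64 virtual address into its four 9-bit
 * table indexes plus the 12-bit page offset (4 KB pages, 512
 * entries per page-table page). */
static void
va_decompose(uint64_t va)
{
	unsigned off  = va & 0xfff;
	unsigned pte  = (va >> 12) & 0x1ff;
	unsigned pde  = (va >> 21) & 0x1ff;
	unsigned pdpe = (va >> 30) & 0x1ff;
	unsigned pml4 = (va >> 39) & 0x1ff;

	printf("pml4=%u pdpe=%u pde=%u pte=%u offset=0x%x\n",
	    pml4, pdpe, pde, pte, off);
}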
*/ #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static pt_entry_t protection_codes[8]; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static int nkpt; static int ndmpdp; vm_offset_t kernel_vm_end; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void amd64_protection_init(void); static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex); static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 2MB. 
This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return newaddr; } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return va >> PDRSHIFT; } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { if (!pmap) return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return NULL; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return NULL; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *pte; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PTmap + (amd64_btop(va) & mask)); } static u_int64_t allocpages(int n) { u_int64_t ret; ret = avail_start; bzero((void *)ret, n * PAGE_SIZE); avail_start += n * PAGE_SIZE; return (ret); } static void create_pagetables(void) { int i; /* Allocate pages */ KPTphys = allocpages(NKPT); KPML4phys = allocpages(1); KPDPphys = allocpages(NKPML4E); KPDphys = allocpages(NKPDPE); ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 1) ndmpdp = 1; DMPDPphys = allocpages(NDMPML4E); DMPDphys = allocpages(ndmpdp); /* Fill in the underlying page table pages */ /* Read-only from zero to physfree */ /* XXX not fully used, underneath 2M pages */ for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) { ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V; } /* Now map the page tables at their location within PTmap */ for (i = 0; i < NKPT; i++) { ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; } #if 0 /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < avail_start; i++) { 
((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS; } #endif /* And connect up the PD to the PDP */ for (i = 0; i < NKPDPE; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } /* Now set up the direct map space using 2MB pages */ for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS; } /* And the direct map space's PDP */ for (i = 0; i < ndmpdp; i++) { ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr) vm_paddr_t *firstaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = *firstaddr; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(); *firstaddr = avail_start; virtual_avail = (vm_offset_t) KERNBASE + avail_start; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE | CR4_PSE); load_cr3(KPML4phys); /* * Initialize protection array. */ amd64_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1 is only used for the memory test. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. 
*/ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) virtual_avail = va; *CMAP1 = 0; invltlb(); } static void * pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (void *)kmem_alloc(kernel_map, bytes); } void * uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { static vm_pindex_t colour; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT) pflags = VM_ALLOC_INTERRUPT; else pflags = VM_ALLOC_SYSTEM; if (wait & M_ZERO) pflags |= VM_ALLOC_ZERO; for (;;) { m = vm_page_alloc(NULL, colour++, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } va = (void *)PHYS_TO_DMAP(m->phys_addr); if ((m->flags & PG_ZERO) == 0) pagezero(va); return (va); } void uma_small_free(void *mem, int size, u_int8_t flags) { vm_page_t m; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem)); vm_page_lock_queues(); vm_page_free(m); vm_page_unlock_queues(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_paddr_t phys_start, phys_end; { int i; int initial_pvs; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); uma_zone_set_allocf(pvzone, pmap_pv_allocf); uma_prealloc(pvzone, initial_pvs); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } /* * Normal invalidation functions. * We inline these within pmap.c for speed. 
*/ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde, *pdep; if (pmap == 0) return 0; pdep = pmap_pde(pmap, va); if (pdep) { pde = *pdep; if (pde) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); return rtval; } pte = pmap_pte(pmap, va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } } return 0; } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = pmap_pde(kernel_pmap, va); if (*pde & PG_PS) { pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)); } else { pa = *vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return pa; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. 
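pmap_qenter() above and pmap_qremove() just below form a matched pair for short-lived kernel mappings of wired pages; the caller owns the kernel VA range and the page list. A hedged usage sketch (assumes this file's kernel headers; 'kva' and 'pages' are supplied by the caller, e.g. from a previously reserved KVA range, and error handling is omitted):

/* Sketch only: temporarily map 'npages' wired pages at 'kva',
 * zero them through the mapping, then remove the mappings again. */
static void
zero_pages_tmp(vm_offset_t kva, vm_page_t *pages, int npages)
{
	int i;

	pmap_qenter(kva, pages, npages);	/* install mappings */
	for (i = 0; i < npages; i++)
		bzero((void *)(kva + i * PAGE_SIZE), PAGE_SIZE);
	pmap_qremove(kva, npages);		/* tear them down */
}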
*/ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pplookp")) goto retry; vm_page_unlock_queues(); } return m; } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); *pd = 0; } --pmap->pm_stats.resident_count; if (m->pindex < NUPDE) { /* Unhold the PD page */ vm_page_t pdpg; pdpg = vm_page_lookup(pmap->pm_pteobj, NUPDE + pmap_pdpe_index(va)); while (vm_page_sleep_if_busy(pdpg, FALSE, "pulook")) vm_page_lock_queues(); vm_page_unhold(pdpg); if (pdpg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdpg); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* Unhold the PDP page */ vm_page_t pdppg; pdppg = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pmap_pml4e_index(va)); while (vm_page_sleep_if_busy(pdppg, FALSE, "pulooK")) vm_page_lock_queues(); vm_page_unhold(pdppg); if (pdppg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdppg); } if (pmap_is_current(pmap)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
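 *
 * To restate the protocol: every user mapping installed in a page table page
 * takes a hold on that page (see pmap_allocpte()), and removing a mapping
 * drops that hold here.  Only when the last hold is gone does
 * _pmap_unwire_pte_hold() unmap the page table page, drop its wire count and
 * free it back to the system.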
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { vm_pindex_t ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = pmap_pde_pindex(va); if (pmap->pm_pteobj->root && pmap->pm_pteobj->root->pindex == ptepindex) { mpte = pmap->pm_pteobj->root; } else { while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL && vm_page_sleep_if_busy(mpte, FALSE, "pulook")) vm_page_lock_queues(); } } return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t pml4pg; /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + NUPML4E + 1); /* * allocate the page directory page */ pml4pg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + NUPML4E, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); vm_page_lock_queues(); vm_page_flag_clear(pml4pg, PG_BUSY); pml4pg->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); if ((pml4pg->flags & PG_ZERO) == 0) bzero(pmap->pm_pml4, PAGE_SIZE); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; vm_pindex_t ptepindex; { vm_page_t m, pdppg, pdpg; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. 
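 *
 * The page table page index encodes the level: indices at or above
 * NUPDE + NUPDPE are PDP pages hung off the PML4, indices at or above NUPDE
 * are PD pages hung off a PDP, and lower indices are PT pages hung off a PD.
 * A missing intermediate level is created by recursing on the corresponding
 * higher-level index.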
*/ pmap->pm_stats.resident_count++; if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ _pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index); } else { /* Add reference to pdp page */ pdppg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pml4index); pdppg->hold_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ _pmap_allocpte(pmap, NUPDE + pdpindex); pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ _pmap_allocpte(pmap, NUPDE + pdpindex); } else { /* Add reference to the pd page */ pdpg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + pdpindex); pdpg->hold_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_lock_queues(); m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_wakeup(m); vm_page_unlock_queues(); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { *pd = 0; pd = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != 0 && (*pd & PG_V) != 0) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { m = pmap->pm_pteobj->root; } else { m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_allocpte(pmap, ptepindex); return m; } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
* Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_object_t object; vm_page_t m; object = pmap->pm_pteobj; KASSERT(object->ref_count == 1, ("pmap_release: pteobj reference count %d != 1", object->ref_count)); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); vm_page_lock_queues(); while ((m = TAILQ_FIRST(&object->memq)) != NULL) { m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_busy(m); vm_page_free(m); } KASSERT(TAILQ_EMPTY(&object->memq), ("pmap_release: leaking page table pages")); vm_page_unlock_queues(); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { int s; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; s = splhigh(); mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { if ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. 
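 *
 * In practice the code below searches whichever list is expected to be
 * shorter -- the page's pv list (matching on pmap and va) or the pmap's pv
 * list (matching on va) -- unlinks the entry from both lists, and frees it.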
*/ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; pte = pmap_pte(pmap, va); if (pte == NULL || (*pte & PG_V) == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva); return; } } anyvalid = 0; for (; sva < eva; sva = pdnxt) { if (pmap->pm_stats.resident_count == 0) break; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == 0) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pte = pmap_pte(pmap, sva); if (pte == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr, *pde; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == NULL) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; pte = pmap_pte(pmap, sva); if (pte == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. 
That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", origpte, va); } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) panic("pmap_enter: invalid page directory va=%#lx\n", va); pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 2MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { pte_store(pte, origpte | PG_RW); pmap_invalidate_page(pmap, va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. 
Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 2MB page"); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, va, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int limit) { vm_pindex_t tmpidx; int psize; vm_page_t p, mpte; if (pmap == NULL || object == NULL) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
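 *
 * Specifically, for device objects whose mapping is 2MB-aligned in both
 * address and size, the block below installs PG_PS superpage PDEs that point
 * directly at the object's physical pages instead of building 4K page tables.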
*/ if ((object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; int npdes; pd_entry_t ptepa, *pde; pde = pmap_pde(pmap, addr); if (pde != 0 && (*pde & PG_V) != 0) return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; pde++; } pmap_invalidate_all(kernel_pmap); return; } psize = amd64_btop(size); if ((object->type != OBJT_VNODE) || ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) { if (object->size < pindex) return; psize = object->size - pindex; } mpte = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((limit & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { break; } vm_page_lock_queues(); if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr + amd64_ptob(tmpidx), p, mpte); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); } } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
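 *
 * Concretely: for a fault at address A it probes A-1, A+1, A-2, A+2, ... out
 * to PFBAK pages behind and PFFOR pages ahead of the fault (see
 * pmap_prefault_pageorder[] below), and enters any resident, idle pages it
 * finds with pmap_enter_quick().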
*/ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; pd_entry_t *pde; if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; pde = pmap_pde(pmap, addr); if (pde == NULL || (*pde & PG_V) == 0) continue; pte = vtopte(addr); if ((*pte & PG_V) == 0) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; vm_page_lock_queues(); if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_lock_queues(); vm_page_wakeup(m); } vm_page_unlock_queues(); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pte = pmap_pte(pmap, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; *pte |= PG_W; } else if (!wired && (*pte & PG_W) != 0) { pmap->pm_stats.wired_count--; *pte &= ~PG_W; } } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr, *pde; vm_pindex_t ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = pmap_pde_pindex(addr); pde = pmap_pde(src_pmap, addr); if (pde) srcptepaddr = *pde; else continue; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { pde = pmap_pde(dst_pmap, addr); if (pde == 0) { /* * XXX should do an allocpte here to * instantiate the pde */ continue; } if (*pde == 0) { *pde = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, addr, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); bcopy((void *)src, (void *)dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
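 *
 * Illustrative sketch: a hypothetical caller asking "does this pmap still map
 * this page?".  The page queues lock is held because it protects the pv list
 * that is searched.
 */

static int
example_page_mapped_by(pmap_t pm, vm_page_t m)
{
	int mapped;

	vm_page_lock_queues();
	mapped = pmap_page_exists_quick(pm, m);
	vm_page_unlock_queues();
	return (mapped);
}

/*
 * pmap_page_exists_quick() itself: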
*/ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08lx\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pv->pv_pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_invalidate_all(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
*/ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (*pte & PG_M) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { register pv_entry_t pv; register pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (setem) { *pte |= bit; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } else { pt_entry_t pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } } } if (!setem && bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_RW, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int s; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { pte_store(pte, v & ~PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { break; } } } while ((pv = pvn) != NULL && pv != pvf); } splx(s); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. 
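 *
 * Illustrative sketch: the reference- and modify-bit helpers here are
 * typically used together for page aging, roughly as below.  The helper is
 * hypothetical and only loosely modeled on how a page daemon might use them.
 */

static void
example_age_page(vm_page_t m)
{
	vm_page_lock_queues();
	/* Harvest and clear the PTE reference bits for this page... */
	if (pmap_ts_referenced(m) == 0) {
		/* ...for an unreferenced page, preserve its modified state... */
		if (pmap_is_modified(m))
			vm_page_dirty(m);
		/* ...and clear the modify bits so future writes are noticed. */
		pmap_clear_modify(m);
	}
	vm_page_unlock_queues();
}

/*
 * pmap_clear_reference() follows: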
*/ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void amd64_protection_init() { register long *kp, prot; #if 0 #define PG_NX (1ul << 63) #else #define PG_NX 0 #endif kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: *kp++ = PG_NX; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = PG_RW | PG_NX; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); pte_clear(pte); } pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); pmap->pm_active |= PCPU_GET(cpumask); cr3 = vtophys(pmap->pm_pml4); /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? 
XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 116360) +++ head/sys/amd64/amd64/trap.c (revision 116361) @@ -1,817 +1,817 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void trap(struct trapframe frame); extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; atomic_add_int(&cnt.v_trap, 1); type = frame.tf_trapno; #ifdef DDB if (db_active) { vm_offset_t eva; eva = (type == T_PAGEFLT ? frame.tf_addr : 0); trap_fatal(&frame, eva); goto out; } #endif if ((frame.tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame.tf_cs) == SEL_UPL) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while in a critical * section. 
*/ if (td->td_critnest == 0) enable_intr(); } } code = frame.tf_err; if (type == T_PAGEFLT) { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ if (td->td_critnest != 0) trap_fatal(&frame, frame.tf_addr); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_rflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); goto out; case T_DNA: /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) { printf("npxdna in kernel mode!\n"); goto out; } break; case T_STKFLT: /* stack fault */ break; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
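 *
 * Two recovery paths follow: a fault taken at doreti_iret is redirected to
 * doreti_iret_fault, and a fault in a routine that registered a pcb_onfault
 * handler (copyin/copyout-style code) is redirected to that handler.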
*/ if (frame.tf_rip == (long)doreti_iret) { frame.tf_rip = (long)doreti_iret_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_rflags & PSL_NT) { frame.tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* DEV_ISA */ } trap_fatal(&frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", frame.tf_addr); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: #ifdef DIAGNOSTIC cred_free_thread(td); #endif out: return; } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss; long esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); if (ISPL(frame->tf_cs) == SEL_UPL) { ss = frame->tf_ss & 0xffff; esp = frame->tf_rsp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (long)&frame->tf_rsp; } printf("stack pointer = 0x%x:0x%lx\n", ss, esp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler() { printf("\nFatal double fault\n"); panic("double fault"); } /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_rflags; u_int sticks; int error; int narg; register_t args[8]; register_t *argp; u_int code; int reg, regcnt; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif reg = 0; regcnt = 6; sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); params = (caddr_t)frame.tf_rsp + sizeof(register_t); code = frame.tf_rax; orig_tf_rflags = frame.tf_rflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. 
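syscall() above takes the system call number from %rax and up to six arguments from registers before falling back to the user stack, and the SYS_syscall/SYS___syscall indirection handled just after this takes the real number from the first argument instead. A minimal userland illustration of that indirection (assumption: a libc providing syscall(2) and <sys/syscall.h>; not part of this change):

#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Both calls reach the same kernel entry; the first one goes
	 * through the SYS_syscall indirection. */
	printf("indirect getpid: %ld\n", (long)syscall(SYS_getpid));
	printf("direct   getpid: %ld\n", (long)getpid());
	return (0);
}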
*/ (*p->p_sysent->sv_prepsyscall)(&frame, (int *)args, &code, &params); } else { if (code == SYS_syscall || code == SYS___syscall) { code = frame.tf_rdi; reg++; regcnt--; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (narg <= regcnt) { argp = &frame.tf_rdi; argp += reg; error = 0; } else { KASSERT(narg <= sizeof(args) / sizeof(args[0]), ("Too many syscall arguments!")); KASSERT(params != NULL, ("copyin args with no params!")); argp = &frame.tf_rdi; argp += reg; bcopy(argp, args, sizeof(args[0]) * regcnt); error = copyin(params, &args[regcnt], (narg - regcnt) * sizeof(args[0])); argp = &args[0]; } #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, argp); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_rdx; STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, argp); } switch (error) { case 0: frame.tf_rax = td->td_retval[0]; frame.tf_rdx = td->td_retval[1]; frame.tf_rflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes. * We have to do a full context restore so that %r10 * (which was holding the value of %rcx) is restored for * the next iteration. */ frame.tf_rip -= frame.tf_err; frame.tf_r10 = frame.tf_rcx; td->td_pcb->pcb_flags |= PCB_FULLCTX; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_rax = error; frame.tf_rflags |= PSL_C; break; } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Traced syscall. */ if (orig_tf_rflags & PSL_T) { frame.tf_rflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/compat/svr4/svr4_sysvec.c =================================================================== --- head/sys/compat/svr4/svr4_sysvec.c (revision 116360) +++ head/sys/compat/svr4/svr4_sysvec.c (revision 116361) @@ -1,421 +1,421 @@ /* * Copyright (c) 1998 Mark Newton * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile SVR4-emulator due to missing COMPAT_43 option!" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int bsd_to_svr4_errno[ELAST+1] = { 0, SVR4_EPERM, SVR4_ENOENT, SVR4_ESRCH, SVR4_EINTR, SVR4_EIO, SVR4_ENXIO, SVR4_E2BIG, SVR4_ENOEXEC, SVR4_EBADF, SVR4_ECHILD, SVR4_EDEADLK, SVR4_ENOMEM, SVR4_EACCES, SVR4_EFAULT, SVR4_ENOTBLK, SVR4_EBUSY, SVR4_EEXIST, SVR4_EXDEV, SVR4_ENODEV, SVR4_ENOTDIR, SVR4_EISDIR, SVR4_EINVAL, SVR4_ENFILE, SVR4_EMFILE, SVR4_ENOTTY, SVR4_ETXTBSY, SVR4_EFBIG, SVR4_ENOSPC, SVR4_ESPIPE, SVR4_EROFS, SVR4_EMLINK, SVR4_EPIPE, SVR4_EDOM, SVR4_ERANGE, SVR4_EAGAIN, SVR4_EINPROGRESS, SVR4_EALREADY, SVR4_ENOTSOCK, SVR4_EDESTADDRREQ, SVR4_EMSGSIZE, SVR4_EPROTOTYPE, SVR4_ENOPROTOOPT, SVR4_EPROTONOSUPPORT, SVR4_ESOCKTNOSUPPORT, SVR4_EOPNOTSUPP, SVR4_EPFNOSUPPORT, SVR4_EAFNOSUPPORT, SVR4_EADDRINUSE, SVR4_EADDRNOTAVAIL, SVR4_ENETDOWN, SVR4_ENETUNREACH, SVR4_ENETRESET, SVR4_ECONNABORTED, SVR4_ECONNRESET, SVR4_ENOBUFS, SVR4_EISCONN, SVR4_ENOTCONN, SVR4_ESHUTDOWN, SVR4_ETOOMANYREFS, SVR4_ETIMEDOUT, SVR4_ECONNREFUSED, SVR4_ELOOP, SVR4_ENAMETOOLONG, SVR4_EHOSTDOWN, SVR4_EHOSTUNREACH, SVR4_ENOTEMPTY, SVR4_EPROCLIM, SVR4_EUSERS, SVR4_EDQUOT, SVR4_ESTALE, SVR4_EREMOTE, SVR4_EBADRPC, SVR4_ERPCMISMATCH, SVR4_EPROGUNAVAIL, SVR4_EPROGMISMATCH, SVR4_EPROCUNAVAIL, SVR4_ENOLCK, SVR4_ENOSYS, SVR4_EFTYPE, SVR4_EAUTH, SVR4_ENEEDAUTH, SVR4_EIDRM, SVR4_ENOMSG, }; static int svr4_fixup(register_t **stack_base, struct image_params *imgp); extern struct sysent svr4_sysent[]; #undef szsigcode #undef sigcode extern int svr4_szsigcode; extern char svr4_sigcode[]; struct sysentvec svr4_sysvec = { SVR4_SYS_MAXSYSCALL, svr4_sysent, 0xff, SVR4_SIGTBLSZ, bsd_to_svr4_sig, ELAST, /* ELAST */ bsd_to_svr4_errno, NULL, svr4_fixup, svr4_sendsig, svr4_sigcode, &svr4_szsigcode, NULL, "SVR4", elf32_coredump, NULL, SVR4_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs }; Elf32_Brandinfo svr4_brand = { ELFOSABI_SYSV, EM_386, /* XXX only implemented for x86 so far. 
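bsd_to_svr4_errno[] above is installed as sv_errtbl in svr4_sysvec, the table the native syscall return path consults when an emulated process takes an error. A hypothetical helper sketching that lookup (assumes the table and ELAST are in scope as in this file; illustration only, not part of this change):

static int
svr4_translate_errno(int bsd_error)
{
	/* Out-of-range errors are passed through unchanged. */
	if (bsd_error < 0 || bsd_error > ELAST)
		return (bsd_error);
	return (bsd_to_svr4_errno[bsd_error]);
}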
*/ "SVR4", svr4_emul_path, "/lib/libc.so.1", &svr4_sysvec }; const char svr4_emul_path[] = "/compat/svr4"; static int svr4_fixup(register_t **stack_base, struct image_params *imgp) { Elf32_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe svr4_fixup(), should be curproc")); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } /* * Search an alternate path before passing pathname arguments on * to system calls. Useful for keeping a separate 'emulation tree'. * * If cflag is set, we check if an attempt can be made to create * the named file, i.e. we check if the directory it should * be in exists. * * Code shamelessly stolen by Mark Newton from IBCS2 emulation code. */ int svr4_emul_find(td, sgp, prefix, path, pbuf, cflag) struct thread *td; caddr_t *sgp; /* Pointer to stackgap memory */ const char *prefix; char *path; char **pbuf; int cflag; { struct nameidata nd; struct nameidata ndroot; struct vattr vat; struct vattr vatroot; int error; char *ptr, *buf, *cp; size_t sz, len; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pbuf = path; for (ptr = buf; (*ptr = *prefix) != '\0'; ptr++, prefix++) continue; sz = MAXPATHLEN - (ptr - buf); /* * If sgp is not given then the path is already in kernel space */ if (sgp == NULL) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { free(buf, M_TEMP); return error; } if (*ptr != '/') { free(buf, M_TEMP); return EINVAL; } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (cflag) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } NDFREE(&nd, NDF_ONLY_PNBUF); *cp = '/'; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } NDFREE(&nd, NDF_ONLY_PNBUF); /* * We now compare the vnode of the svr4_root to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, svr4_emul_path, td); if ((error = namei(&ndroot)) != 0) { /* Cannot happen! 
*/ free(buf, M_TEMP); vrele(nd.ni_vp); return error; } NDFREE(&ndroot, NDF_ONLY_PNBUF); if ((error = VOP_GETATTR(nd.ni_vp, &vat, td->td_ucred, td)) != 0) { goto done; } if ((error = VOP_GETATTR(ndroot.ni_vp, &vatroot, td->td_ucred, td)) != 0) { goto done; } if (vat.va_fsid == vatroot.va_fsid && vat.va_fileid == vatroot.va_fileid) { error = ENOENT; goto done; } } if (sgp == NULL) *pbuf = buf; else { sz = &ptr[len] - buf; *pbuf = stackgap_alloc(sgp, sz + 1); error = copyout(buf, *pbuf, sz); free(buf, M_TEMP); } done: vrele(nd.ni_vp); if (!cflag) vrele(ndroot.ni_vp); return error; } static int svr4_elf_modevent(module_t mod, int type, void *data) { int error; error = 0; switch(type) { case MOD_LOAD: if (elf32_insert_brand_entry(&svr4_brand) < 0) error = EINVAL; if (error) printf("cannot insert svr4 elf brand handler\n"); else if (bootverbose) printf("svr4 ELF exec handler installed\n"); break; case MOD_UNLOAD: /* Only allow the emulator to be removed if it isn't in use. */ if (elf32_brand_inuse(&svr4_brand) != 0) { error = EBUSY; } else if (elf32_remove_brand_entry(&svr4_brand) < 0) { error = EINVAL; } if (error) printf("Could not deinstall ELF interpreter entry (error %d)\n", error); else if (bootverbose) printf("svr4 ELF exec handler removed\n"); break; default: break; } return error; } static moduledata_t svr4_elf_mod = { "svr4elf", svr4_elf_modevent, 0 }; DECLARE_MODULE(svr4elf, svr4_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(svr4elf, streams, 1, 1, 1); Index: head/sys/ddb/db_ps.c =================================================================== --- head/sys/ddb/db_ps.c (revision 116360) +++ head/sys/ddb/db_ps.c (revision 116361) @@ -1,233 +1,233 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
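svr4_elf_modevent() above follows the standard modevent pattern: install the ELF brand on MOD_LOAD and refuse MOD_UNLOAD while the brand is still in use. A stripped-down sketch of that pattern for a hypothetical module (assumes the usual <sys/module.h>/<sys/kernel.h> headers; illustration only, not part of this change):

static int
example_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		printf("example module loaded\n");
		return (0);
	case MOD_UNLOAD:
		printf("example module unloaded\n");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = { "example", example_modevent, NULL };
DECLARE_MODULE(example, example_mod, SI_SUB_EXEC, SI_ORDER_ANY);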
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static void dumpthread(volatile struct proc *p, volatile struct thread *td); void db_ps(dummy1, dummy2, dummy3, dummy4) db_expr_t dummy1; boolean_t dummy2; db_expr_t dummy3; char * dummy4; { int np; int nl = 0; volatile struct proc *p, *pp; volatile struct thread *td; char *state; np = nprocs; /* sx_slock(&allproc_lock); */ if (!LIST_EMPTY(&allproc)) p = LIST_FIRST(&allproc); else p = &proc0; db_printf(" pid proc addr uid ppid pgrp flag stat wmesg wchan cmd\n"); while (--np >= 0) { /* * XXX just take 20 for now... */ if (nl++ >= 20) { int c; db_printf("--More--"); c = cngetc(); db_printf("\r"); /* * A whole screenfull or just one line? */ switch (c) { case '\n': /* just one line */ nl = 20; break; case ' ': nl = 0; /* another screenfull */ break; default: /* exit */ db_printf("\n"); return; } } if (p == NULL) { printf("oops, ran out of processes early!\n"); break; } /* PROC_LOCK(p); */ pp = p->p_pptr; if (pp == NULL) pp = p; switch(p->p_state) { case PRS_NORMAL: if (P_SHOULDSTOP(p)) state = "stop"; else state = ""; break; case PRS_NEW: state = "new "; break; case PRS_ZOMBIE: state = "zomb"; break; default: state = "Unkn"; break; } db_printf("%5d %8p %8p %4d %5d %5d %07x %s", p->p_pid, (volatile void *)p, (void *)p->p_uarea, p->p_ucred != NULL ? p->p_ucred->cr_ruid : 0, pp->p_pid, p->p_pgrp != NULL ? p->p_pgrp->pg_id : 0, p->p_flag, state); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) db_printf("(threaded) %s\n", p->p_comm); FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td); nl++; } /* PROC_UNLOCK(p); */ p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) p = LIST_FIRST(&zombproc); } /* sx_sunlock(&allproc_lock); */ } static void dumpthread(volatile struct proc *p, volatile struct thread *td) { - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) db_printf( " thread %p ksegrp %p ", td, td->td_ksegrp); if (TD_ON_SLEEPQ(td)) { if (td->td_flags & TDF_CVWAITQ) if (TD_IS_SLEEPING(td)) db_printf("[CV]"); else db_printf("[CVQ"); else if (TD_IS_SLEEPING(td)) db_printf("[SLP]"); else db_printf("[SLPQ"); db_printf("%s %p]", td->td_wmesg, (void *)td->td_wchan); } switch (td->td_state) { case TDS_INHIBITED: if (TD_ON_LOCK(td)) { db_printf("[LOCK %6s %8p]", td->td_lockname, (void *)td->td_blocked); } #if 0 /* covered above */ if (TD_IS_SLEEPING(td)) { db_printf("[SLP]"); } #endif if (TD_IS_SWAPPED(td)) { db_printf("[SWAP]"); } if (TD_IS_SUSPENDED(td)) { db_printf("[SUSP]"); } if (TD_AWAITING_INTR(td)) { db_printf("[IWAIT]"); } break; case TDS_CAN_RUN: db_printf("[Can run]"); break; case TDS_RUNQ: db_printf("[RUNQ]"); break; case TDS_RUNNING: db_printf("[CPU %d]", td->td_oncpu); break; case TDS_INACTIVE: db_printf("[INACTIVE]"); break; default: db_printf("[UNK: %#x]", td->td_state); } - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { if (td->td_kse) db_printf("[kse %p]", td->td_kse); db_printf("\n"); } else db_printf(" %s\n", p->p_comm); } #define INKERNEL(va) (((vm_offset_t)(va)) >= USRSTACK) void db_show_one_thread(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif) { struct proc *p; struct thread *td; if (!have_addr) td = curthread; else if (!INKERNEL(addr)) { printf("bad thread address"); return; } else td = (struct thread *)addr; /* quick sanity check */ if ((p = td->td_proc) != td->td_ksegrp->kg_proc) return; printf("Proc %p ",p); dumpthread(p, td); #ifdef __i386__ db_stack_thread((db_expr_t)td, 1, count, modif); #endif } Index: 
head/sys/fs/procfs/procfs_status.c =================================================================== --- head/sys/fs/procfs/procfs_status.c (revision 116360) +++ head/sys/fs/procfs/procfs_status.c (revision 116361) @@ -1,214 +1,214 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 * * From: * $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int procfs_doprocstatus(PFS_FILL_ARGS) { struct session *sess; struct thread *tdfirst; struct tty *tp; struct ucred *cr; const char *wmesg; char *pc; char *sep; int pid, ppid, pgid, sid; int i; pid = p->p_pid; PROC_LOCK(p); ppid = p->p_pptr ? p->p_pptr->p_pid : 0; pgid = p->p_pgrp->pg_id; sess = p->p_pgrp->pg_session; SESS_LOCK(sess); sid = sess->s_leader ? sess->s_leader->p_pid : 0; /* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg euid ruid rgid,egid,groups[1 .. 
NGROUPS] */ pc = p->p_comm; do { if (*pc < 33 || *pc > 126 || *pc == '\\') sbuf_printf(sb, "\\%03o", *pc); else sbuf_putc(sb, *pc); } while (*++pc); sbuf_printf(sb, " %d %d %d %d ", pid, ppid, pgid, sid); if ((p->p_flag & P_CONTROLT) && (tp = sess->s_ttyp)) sbuf_printf(sb, "%d,%d ", major(tp->t_dev), minor(tp->t_dev)); else sbuf_printf(sb, "%d,%d ", -1, -1); sep = ""; if (sess->s_ttyvp) { sbuf_printf(sb, "%sctty", sep); sep = ","; } if (SESS_LEADER(p)) { sbuf_printf(sb, "%ssldr", sep); sep = ","; } SESS_UNLOCK(sess); if (*sep != ',') { sbuf_printf(sb, "noflags"); } mtx_lock_spin(&sched_lock); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) wmesg = "-kse- "; else { tdfirst = FIRST_THREAD_IN_PROC(p); if (tdfirst->td_wchan != NULL) { KASSERT(tdfirst->td_wmesg != NULL, ("wchan %p has no wmesg", tdfirst->td_wchan)); wmesg = tdfirst->td_wmesg; } else wmesg = "nochan"; } if (p->p_sflag & PS_INMEM) { struct timeval start, ut, st; calcru(p, &ut, &st, (struct timeval *) NULL); mtx_unlock_spin(&sched_lock); start = p->p_stats->p_start; timevaladd(&start, &boottime); sbuf_printf(sb, " %ld,%ld %ld,%ld %ld,%ld", start.tv_sec, start.tv_usec, ut.tv_sec, ut.tv_usec, st.tv_sec, st.tv_usec); } else { mtx_unlock_spin(&sched_lock); sbuf_printf(sb, " -1,-1 -1,-1 -1,-1"); } sbuf_printf(sb, " %s", wmesg); cr = p->p_ucred; sbuf_printf(sb, " %lu %lu %lu", (u_long)cr->cr_uid, (u_long)cr->cr_ruid, (u_long)cr->cr_rgid); /* egid (cr->cr_svgid) is equal to cr_ngroups[0] see also getegid(2) in /sys/kern/kern_prot.c */ for (i = 0; i < cr->cr_ngroups; i++) { sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]); } if (jailed(p->p_ucred)) { mtx_lock(&p->p_ucred->cr_prison->pr_mtx); sbuf_printf(sb, " %s", p->p_ucred->cr_prison->pr_host); mtx_unlock(&p->p_ucred->cr_prison->pr_mtx); } else { sbuf_printf(sb, " -"); } PROC_UNLOCK(p); sbuf_printf(sb, "\n"); return (0); } int procfs_doproccmdline(PFS_FILL_ARGS) { struct ps_strings pstr; int error, i; /* * If we are using the ps/cmdline caching, use that. Otherwise * revert back to the old way which only implements full cmdline * for the currept process and just p->p_comm for all other * processes. * Note that if the argv is no longer available, we deliberately * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ PROC_LOCK(p); if (p->p_args && (ps_argsopen || !p_cansee(td, p))) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (p != td->td_proc) { sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm); } else { error = copyin((void *)p->p_sysent->sv_psstrings, &pstr, sizeof(pstr)); if (error) return (error); for (i = 0; i < pstr.ps_nargvstr; i++) { sbuf_copyin(sb, pstr.ps_argvstr[i], 0); sbuf_printf(sb, "%c", '\0'); } } return (0); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 116360) +++ head/sys/i386/i386/pmap.c (revision 116361) @@ -1,3397 +1,3397 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ #include __FBSDID("$FreeBSD$"); /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Manages physical address maps. 
* * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include "opt_swtch.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; #if defined(SMP) && defined(LAZY_SWITCH) static struct mtx lazypmap_lock; #endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
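The pmap_pde()/pdir_pde() macros above rely on the non-PAE i386 address split: 10 bits of page directory index, 10 bits of page table index, 12 bits of page offset. A small arithmetic sketch (constants written out for clarity; the kernel uses PDRSHIFT/PAGE_SHIFT; illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int va  = 0xdeadbeef;
	unsigned int pdi = va >> 22;		/* PDRSHIFT */
	unsigned int pti = (va >> 12) & 0x3ff;	/* PAGE_SHIFT, NPTEPG - 1 */
	unsigned int off = va & 0xfff;		/* PAGE_MASK */

	printf("va=%#x pdi=%u pti=%u off=%#x\n", va, pdi, pti, off);
	return (0);
}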
*/ static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE static uma_zone_t pdptzone; #endif /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *CMAP3, *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2, CADDR3; static struct mtx CMAPCADDR12_lock; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0; static pt_entry_t *PADDR1 = 0; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void i386_protection_init(void); static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif static pd_entry_t pdir4mb; CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) return newaddr; #endif #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_paddr_t firstaddr; vm_paddr_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. 
It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); #if defined(SMP) && defined(LAZY_SWITCH) mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); #endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF); /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); virtual_avail = va; *CMAP1 = *CMAP2 = 0; for (i = 0; i < NKPT; i++) PTD[i] = 0; pgeflag = 0; #ifndef DISABLE_PG_G if (cpu_feature & CPUID_PGE) pgeflag = PG_G; #endif #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) { printf("Warning: Pentium 4 cpu: PG_G disabled (global flag)\n"); pgeflag = 0; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) pseflag = PG_PS; #endif #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) { printf("Warning: Pentium 4 cpu: PG_PS disabled (4MB pages)\n"); pseflag = 0; } #endif #ifndef DISABLE_PSE if (pseflag) { pd_entry_t ptditmp; /* * Note that we have enabled PSE mode */ ptditmp = *(PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; } #endif #ifndef SMP /* * Turn on PGE/PSE. SMP does this later on since the * 4K page tables are required for AP boot (for now). * XXX fixme. */ pmap_set_opt(); #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! 
(non-SMP hardware?)"); /* local apic is mapped on last page */ SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif invltlb(); } /* * Enable 4MB page mode for MP startup. Turn on PG_G support. * BSP will run this after all the AP's have started up. */ void pmap_set_opt(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); invltlb(); /* Insurance */ } #ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); invltlb(); /* Insurance */ } #endif if (PCPU_GET(cpuid) == 0) { #ifndef DISABLE_PSE if (pdir4mb) { kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb; invltlb(); /* Insurance */ } #endif if (pgeflag) { /* Turn on PG_G for text, data, bss pages. */ va = (vm_offset_t)btext; #ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { if (va < KERNBASE + (1 << PDRSHIFT)) va = KERNBASE + (1 << PDRSHIFT); } #endif endva = KERNBASE + KERNend; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; va += PAGE_SIZE; } invltlb(); /* Insurance */ } /* * We do not need to broadcast the invltlb here, because * each AP does it the moment it is released from the boot * lock. See ap_init(). */ } } static void * pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (void *)kmem_alloc(kernel_map, bytes); } #ifdef PAE static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_paddr_t phys_start, phys_end; { int i; int initial_pvs; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); uma_zone_set_allocf(pvzone, pmap_pv_allocf); uma_prealloc(pvzone, initial_pvs); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 0); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. 
*/ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } #ifdef I386_CPU /* * i386 only has "invalidate everything" and no SMP to worry about. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #else /* !I386_CPU */ #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } critical_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } critical_exit(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } critical_exit(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #endif /* !SMP */ #endif /* !I386_CPU */ /* * Are we current address space or kernel? 
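The SMP invalidation routines above only send shootdown IPIs to CPUs whose bits are set in pm_active, while the local CPU invalidates directly. A toy sketch of that mask arithmetic with made-up values (illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int pm_active  = 0x5;			/* pmap active on CPUs 0 and 2 */
	unsigned int cpumask    = 0x1;			/* we are CPU 0 */
	unsigned int other_cpus = ~cpumask & 0xf;	/* hypothetical 4-CPU box */

	if (pm_active & cpumask)
		printf("invalidate locally\n");
	if (pm_active & other_cpus)
		printf("IPI the CPUs in mask %#x\n", pm_active & other_cpus);
	return (0);
}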
*/ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ pt_entry_t * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return vtopte(va); newpf = *pde & PG_FRAME; if (((*PMAP1) & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1); } return PADDR1 + (i386_btop(va) & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; if (pmap == 0) return 0; pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); return rtval; } pte = pmap_pte_quick(pmap, va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. 
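pmap_extract() above handles 4MB (PG_PS) mappings by taking the frame from the PDE and the low 22 bits from the virtual address. Worked example with made-up values (illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int pde     = 0x00c001e3;	/* 4MB frame 0x00c00000, PS|V set */
	unsigned int va      = 0xc0123456;
	unsigned int pdrmask = (1u << 22) - 1;	/* PDRMASK */
	unsigned int pa      = (pde & ~pdrmask) | (va & pdrmask);

	printf("pa = %#x\n", pa);		/* 0xd23456 */
	return (0);
}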
*/ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pplookp")) goto retry; vm_page_unlock_queues(); } return m; } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if (pmap_is_current(pmap)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { unsigned ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL && vm_page_sleep_if_busy(mpte, FALSE, "pulook")) vm_page_lock_queues(); } } return pmap_unwire_pte_hold(pmap, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
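_pmap_unwire_pte_hold()/pmap_unwire_pte_hold() above keep a page table page alive while any PTE still holds it: each mapping contributes a hold, and the wire count is only released once the holds are gone. A toy model of that accounting with hypothetical structures (illustration only, not the kernel's types):

#include <stdbool.h>
#include <stdio.h>

struct fake_ptpage {
	int hold_count;		/* live PTEs mapped through this page */
	int wire_count;		/* keeps the page from being freed */
};

static bool
unwire_pte_hold(struct fake_ptpage *m)
{
	if (--m->hold_count > 0)
		return (false);
	if (--m->wire_count == 0) {
		printf("page table page can be freed\n");
		return (true);
	}
	return (false);
}

int
main(void)
{
	struct fake_ptpage m = { .hold_count = 2, .wire_count = 1 };

	unwire_pte_hold(&m);	/* one PTE gone, page stays */
	unwire_pte_hold(&m);	/* last PTE gone, page freeable */
	return (0);
}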
*/ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, NBPTD); #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif } /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + NPGPTD); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD; i++) { ptdpg[i] = vm_page_grab(pmap->pm_pteobj, PTDPTDI + i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); vm_page_lock_queues(); vm_page_flag_clear(ptdpg[i], PG_BUSY); ptdpg[i]->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_paddr_t ptepa; vm_offset_t pteva; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) { if (pmap_is_current(pmap)) { pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(m); } } vm_page_lock_queues(); m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_wakeup(m); vm_page_unlock_queues(); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. 
*/ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { m = pmap->pm_pteobj->root; } else { m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef LAZY_SWITCH #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); if (rcr3() == lazyptd) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); #ifdef SWTCH_OPTIM_STATS atomic_add_int(&lazy_flush_smpfixup, 1); } else { if (*lazymask & mymask) lazy_flush_smpbadcr3++; else lazy_flush_smpmiss++; #endif } atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); #ifdef SWTCH_OPTIM_STATS lazy_flush_fixup++; } else { if (*lazymask & mymask) lazy_flush_smpbadcr3++; else lazy_flush_smpmiss++; #endif } atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask = PCPU_GET(cpumask); u_int mask; register u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&lazypmap_lock); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } #ifdef SWTCH_OPTIM_STATS lazy_flush_smpipi++; #endif } mtx_unlock_spin(&lazypmap_lock); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); #ifdef SWTCH_OPTIM_STATS lazy_flush_fixup++; #endif } } #endif /* SMP */ #endif /* LAZY_SWITCH */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
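/*
 * [Editorial sketch -- not part of this commit.]  pmap_lazyfix() above peels
 * CPUs off pm_active one at a time with "mask = mask & -mask", which isolates
 * the least significant set bit of the bitmap.  A standalone illustration of
 * that trick; the sample mask is an assumption:
 */
#include <stdio.h>

int
main(void)
{
	unsigned active = 0x16;				/* hypothetical: CPUs 1, 2 and 4 */

	while (active != 0) {
		unsigned one = active & -active;	/* lowest set bit only */
		printf("shoot down CPU mask 0x%x\n", one);
		active &= ~one;				/* that CPU is done */
	}
	return (0);
}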
*/ void pmap_release(pmap_t pmap) { vm_object_t object; vm_page_t m; int i; object = pmap->pm_pteobj; KASSERT(object->ref_count == 1, ("pmap_release: pteobj reference count %d != 1", object->ref_count)); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); #ifdef LAZY_SWITCH pmap_lazyfix(pmap); #endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); #ifdef SMP pmap->pm_pdir[MPPTDI] = 0; #endif pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); vm_page_lock_queues(); for (i = 0; i < NPGPTD; i++) { m = TAILQ_FIRST(&object->memq); #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_busy(m); vm_page_free_zero(m); } KASSERT(TAILQ_EMPTY(&object->memq), ("pmap_release: leaking page table pages")); vm_page_unlock_queues(); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; int s; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; s = splhigh(); mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
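/*
 * [Editorial sketch -- not part of this commit.]  pmap_growkernel() above
 * rounds the requested address up to a page-table boundary and then advances
 * kernel_vm_end one 4 MB page-table span at a time with
 * "(x + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1)".  The arithmetic in
 * isolation; the span size and sample addresses are assumptions:
 */
#include <stdio.h>

#define PTSPAN		(4UL * 1024 * 1024)	/* PAGE_SIZE * NPTEPG, non-PAE i386 */
#define ROUNDUP2(x, y)	(((x) + (y) - 1) & ~((y) - 1))	/* y must be a power of 2 */

int
main(void)
{
	unsigned long end = 0xc1000000UL;	/* assumed current kernel_vm_end */
	unsigned long addr = 0xc1a12345UL;	/* requested growth target */

	addr = ROUNDUP2(addr, PTSPAN);
	while (end < addr) {
		printf("allocate page table covering 0x%lx\n", end);
		end = (end + PTSPAN) & ~(PTSPAN - 1);
	}
	return (0);
}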
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. 
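/*
 * [Editorial sketch -- not part of this commit.]  get_pv_entry() above wakes
 * the pagedaemon at most once per episode: the allocation counter is compared
 * against a high-water mark and a one-shot flag suppresses repeated wakeups.
 * The shape of that check, with hypothetical names and a tiny threshold
 * standing in for the kernel globals:
 */
#include <stdio.h>

static int pv_count, pv_high_water = 3, pagedaemon_waken;

static void
note_pv_alloc(void)
{
	pv_count++;
	if (pv_high_water && pv_count > pv_high_water &&
	    pagedaemon_waken == 0) {
		pagedaemon_waken = 1;		/* wakeup(&vm_pages_needed) in the kernel */
		printf("wake pagedaemon at pv_count=%d\n", pv_count);
	}
}

int
main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		note_pv_alloc();
	printf("woken exactly once: %d\n", pagedaemon_waken);
	return (0);
}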
*/ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { if ((pte = pmap_pte_quick(pmap, sva)) == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; for (; sva < eva; sva = pdnxt) { unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. 
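/*
 * [Editorial sketch -- not part of this commit.]  Both pmap_remove() and
 * pmap_protect() walk a VA range one page-directory entry at a time:
 * "pdnxt = (sva + NBPDR) & ~PDRMASK" jumps to the start of the next 4 MB
 * slot and is then clipped to eva.  Illustrated standalone; the sample
 * range is an assumption:
 */
#include <stdio.h>

#define NBPDR	(4UL * 1024 * 1024)	/* bytes mapped by one PDE, non-PAE */
#define PDRMASK	(NBPDR - 1)

int
main(void)
{
	unsigned long sva = 0x081fe000UL, eva = 0x08602000UL, pdnxt;

	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;	/* next 4 MB boundary */
		if (pdnxt > eva)
			pdnxt = eva;			/* clip to end of range */
		printf("process [0x%lx, 0x%lx)\n", sva, pdnxt);
	}
	return (0);
}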
*/ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; if ((pte = pmap_pte_quick(pmap, sva)) == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } } #endif pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { pte_store(pte, origpte | PG_RW); pmap_invalidate_page(pmap, va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
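/*
 * [Editorial sketch -- not part of this commit.]  pmap_protect() above
 * downgrades a writable mapping by folding the hardware-modified bit into
 * the vm_page dirty state and then stripping PG_M and PG_RW from the PTE.
 * The bit values below are the usual i386 ones but are restated here as
 * assumptions:
 */
#include <stdio.h>

#define PG_V	0x001	/* valid */
#define PG_RW	0x002	/* writable */
#define PG_A	0x020	/* accessed */
#define PG_M	0x040	/* modified (dirty) */

int
main(void)
{
	unsigned pte = 0x123000 | PG_V | PG_RW | PG_A | PG_M; /* pa | flags */
	int page_dirty = 0;

	if (pte & PG_M)
		page_dirty = 1;			/* vm_page_dirty(m) in the kernel */
	pte &= ~(PG_M | PG_RW);			/* now read-only and clean */

	printf("pte 0x%x, software dirty bit %d\n", pte, page_dirty);
	return (0);
}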
*/ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. 
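/*
 * [Editorial sketch -- not part of this commit.]  At the end of pmap_enter()
 * the new PTE is compared against the old one with PG_M and PG_A masked off,
 * so a mapping that differs only in hardware-maintained bits is not
 * rewritten.  The comparison in isolation; bit values and the sample
 * physical address are assumptions:
 */
#include <stdio.h>

#define PG_V	0x001
#define PG_RW	0x002
#define PG_U	0x004
#define PG_A	0x020
#define PG_M	0x040

int
main(void)
{
	unsigned long pa = 0x123000UL;
	unsigned long origpte = pa | PG_V | PG_RW | PG_U | PG_A | PG_M;
	unsigned long newpte  = pa | PG_V | PG_RW | PG_U;

	if ((origpte & ~(unsigned long)(PG_M | PG_A)) != newpte)
		printf("rewrite PTE and invalidate the TLB entry\n");
	else
		printf("mapping unchanged, leave the PTE alone\n");
	return (0);
}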
This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); #ifndef I386_CPU invlpg(va); #else invltlb(); #endif return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int limit) { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; if (pmap == NULL || object == NULL) return; VM_OBJECT_LOCK(object); /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) goto unlock_return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) goto unlock_return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); goto unlock_return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { goto unlock_return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(kernel_pmap); goto unlock_return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { goto unlock_return; } if (psize + pindex > object->size) { if (object->size < pindex) goto unlock_return; psize = object->size - pindex; } mpte = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((limit & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { break; } vm_page_lock_queues(); if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); } unlock_return: VM_OBJECT_UNLOCK(object); } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
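/*
 * [Editorial sketch -- not part of this commit.]  For a suitably aligned
 * device object, pmap_object_init_pt() above installs one 4 MB PG_PS
 * directory entry per NBPDR of backing store instead of 1024 individual
 * PTEs.  The loop shape, with an assumed physical base and region size:
 */
#include <stdio.h>

#define PDRSHIFT	22
#define NBPDR		(1UL << PDRSHIFT)

int
main(void)
{
	unsigned long ptepa = 0xe0000000UL;	 /* assumed 4 MB-aligned phys addr */
	unsigned long size = 16UL * 1024 * 1024; /* 16 MB region */
	int i, npdes = size >> PDRSHIFT;

	for (i = 0; i < npdes; i++) {
		printf("pde[%d] = 0x%lx | PG_PS | PG_V | PG_RW | PG_U\n",
		    i, ptepa);
		ptepa += NBPDR;
	}
	return (0);
}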
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == 0) continue; pte = vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; vm_page_lock_queues(); if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_lock_queues(); vm_page_wakeup(m); } vm_page_unlock_queues(); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte_quick(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
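/*
 * [Editorial sketch -- not part of this commit.]  pmap_prefault() probes
 * pages alternating on either side of the faulting address: -1, +1, -2, +2,
 * ... up to PFBAK/PFFOR pages away, mirroring pmap_prefault_pageorder[].
 * Generating those candidate addresses standalone; the fault address is an
 * assumption:
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define PFBAK		4
#define PFFOR		4

static const int pageorder[] = {
	-1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE,
	-3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE
};

int
main(void)
{
	unsigned long addra = 0x08050000UL;	/* hypothetical fault address */
	unsigned i;

	for (i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++)
		printf("probe 0x%lx\n", addra + pageorder[i]);
	return (0);
}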
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte_quick(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } #ifdef SMP /* * pmap_zpi_switchin*() * * These functions allow us to avoid doing IPIs alltogether in certain * temporary page-mapping situations (page zeroing). Instead to deal * with being preempted and moved onto a different cpu we invalidate * the page when the scheduler switches us in. This does not occur * very often so we remain relatively optimal with very little effort. */ static void pmap_zpi_switchin12(void) { invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); } static void pmap_zpi_switchin2(void) { invlpg((u_int)CADDR2); } static void pmap_zpi_switchin3(void) { invlpg((u_int)CADDR3); } #endif /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin2; #endif invlpg((u_int)CADDR2); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. 
*/ void pmap_zero_page_area(vm_page_t m, int off, int size) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin2; #endif invlpg((u_int)CADDR2); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); else #endif bzero((char *)CADDR2 + off, size); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin3; #endif invlpg((u_int)CADDR3); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR3); else #endif bzero(CADDR3, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP3 = 0; } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*CMAP2) panic("pmap_copy_page: CMAP2 busy"); *CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin12; #endif invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); #endif bcopy(CADDR1, CADDR2, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP1 = 0; *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
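/*
 * [Editorial sketch -- not part of this commit.]  pmap_page_exists_quick()
 * above only inspects the first 16 pv entries of a page before giving up, so
 * a FALSE answer is merely a hint for page aging.  The bounded scan, with a
 * plain array standing in for the pv list and made-up pmap identifiers:
 */
#include <stdio.h>

int
main(void)
{
	int pv_pmaps[] = { 3, 7, 9, 11, 42, 5 };	/* hypothetical owners */
	int target = 42, loops = 0, found = 0;
	unsigned i, n = sizeof(pv_pmaps) / sizeof(pv_pmaps[0]);

	for (i = 0; i < n; i++) {
		if (pv_pmaps[i] == target) {
			found = 1;
			break;
		}
		if (++loops >= 16)			/* give up after 16 entries */
			break;
	}
	printf("found=%d after %d loops\n", found, loops);
	return (0);
}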
*/ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pv->pv_pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_invalidate_all(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & PG_M) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { register pv_entry_t pv; register pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? 
*/ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *pte |= bit; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } else { pt_entry_t pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } } } if (!setem && bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_RW, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int s; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { pte_store(pte, v & ~PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { break; } } } while ((pv = pvn) != NULL && pv != pvf); } splx(s); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
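/*
 * [Editorial sketch -- not part of this commit.]  i386_protection_init()
 * below fills an 8-entry table indexed by the VM_PROT_{READ,WRITE,EXECUTE}
 * bits.  Because i386 page tables have no execute bit, every combination
 * collapses to either "readable" (0) or "writable" (PG_RW).  The same table
 * built with a loop instead of the switch; bit values are assumptions:
 */
#include <stdio.h>

#define VM_PROT_READ	0x1
#define VM_PROT_WRITE	0x2
#define VM_PROT_EXECUTE	0x4
#define PG_RW		0x2

int
main(void)
{
	int codes[8], prot;

	for (prot = 0; prot < 8; prot++)
		codes[prot] = (prot & VM_PROT_WRITE) ? PG_RW : 0;
	for (prot = 0; prot < 8; prot++)
		printf("prot %d -> pte bits 0x%x\n", prot, codes[prot]);
	return (0);
}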
*/ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); pte_clear(pte); } pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte_quick(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? 
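/*
 * [Editorial sketch -- not part of this commit.]  pmap_mapdev() above keeps
 * the sub-page offset of the physical address, rounds the mapping size up to
 * whole pages, and returns va + offset so callers see the exact register
 * address.  The arithmetic by itself; the device address and size are
 * assumptions:
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define PAGE_MASK	(PAGE_SIZE - 1)
#define PG_FRAME	(~(unsigned long)PAGE_MASK)

int
main(void)
{
	unsigned long pa = 0xfee00c30UL;	/* hypothetical device register */
	unsigned long size = 0x20;
	unsigned long offset = pa & PAGE_MASK;

	size = (offset + size + PAGE_MASK) & ~(unsigned long)PAGE_MASK;
	pa &= PG_FRAME;				/* page-aligned mapping base */
	printf("map %lu byte(s) at pa 0x%lx, return va + 0x%lx\n",
	    size, pa, offset);
	return (0);
}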
XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(pa) vm_paddr_t pa; { pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/sys_machdep.c =================================================================== --- head/sys/i386/i386/sys_machdep.c (revision 116360) +++ head/sys/i386/i386/sys_machdep.c (revision 116361) @@ -1,567 +1,567 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91 * */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included by sys/user.h */ #include #include #include /* for kernel_map */ #define MAX_LD 8192 #define LD_PER_PAGE 512 #define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1)) #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3) static int i386_get_ldt(struct thread *, char *); static int i386_set_ldt(struct thread *, char *); static int i386_get_ioperm(struct thread *, char *); static int i386_set_ioperm(struct thread *, char *); #ifdef SMP static void set_user_ldt_rv(struct thread *); #endif #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; char *parms; }; #endif int sysarch(td, uap) struct thread *td; register struct sysarch_args *uap; { int error; mtx_lock(&Giant); switch(uap->op) { case I386_GET_LDT: error = i386_get_ldt(td, uap->parms); break; case I386_SET_LDT: error = i386_set_ldt(td, uap->parms); break; case I386_GET_IOPERM: error = i386_get_ioperm(td, uap->parms); break; case I386_SET_IOPERM: error = i386_set_ioperm(td, uap->parms); break; case I386_VM86: error = vm86_sysarch(td, uap->parms); break; default: error = EINVAL; break; } mtx_unlock(&Giant); return (error); } int i386_extend_pcb(struct thread *td) { int i, offset; u_long *addr; struct pcb_ext *ext; struct soft_segment_descriptor ssd = { 0, /* segment base address (overwritten) */ ctob(IOPAGES + 1) - 1, /* length */ SDT_SYS386TSS, /* segment type */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 0, /* default 32 size */ 0 /* granularity */ }; - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) return (EINVAL); /* XXXKSE */ /* XXXKSE All the code below only works in 1:1 needs changing */ ext = (struct pcb_ext *)kmem_alloc(kernel_map, ctob(IOPAGES+1)); if (ext == 0) return (ENOMEM); bzero(ext, sizeof(struct pcb_ext)); /* -16 is so we can convert a trapframe into vm86trapframe inplace */ ext->ext_tss.tss_esp0 = td->td_kstack + ctob(KSTACK_PAGES) - sizeof(struct pcb) - 16; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); /* * The last byte of the i/o map must be followed by an 0xff byte. * We arbitrarily allocate 16 bytes here, to keep the starting * address on a doubleword boundary. 
*/ offset = PAGE_SIZE - 16; ext->ext_tss.tss_ioopt = (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16; ext->ext_iomap = (caddr_t)ext + offset; ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32; addr = (u_long *)ext->ext_vm86.vm86_intmap; for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++) *addr++ = ~0; ssd.ssd_base = (unsigned)&ext->ext_tss; ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext); ssdtosd(&ssd, &ext->ext_tssd); KASSERT(td->td_proc == curthread->td_proc, ("giving TSS to !curproc")); KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!")); mtx_lock_spin(&sched_lock); td->td_pcb->pcb_ext = ext; /* switch to the new TSS after syscall completes */ td->td_flags |= TDF_NEEDRESCHED; mtx_unlock_spin(&sched_lock); return 0; } static int i386_set_ioperm(td, args) struct thread *td; char *args; { int i, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); #ifdef MAC if ((error = mac_check_sysarch_ioperm(td->td_ucred)) != 0) return (error); #endif if ((error = suser(td)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); /* * XXX * While this is restricted to root, we should probably figure out * whether any other driver is using this i/o address, as so not to * cause confusion. This probably requires a global 'usage registry'. */ if (td->td_pcb->pcb_ext == 0) if ((error = i386_extend_pcb(td)) != 0) return (error); iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; if (ua.start + ua.length > IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); for (i = ua.start; i < ua.start + ua.length; i++) { if (ua.enable) iomap[i >> 3] &= ~(1 << (i & 7)); else iomap[i >> 3] |= (1 << (i & 7)); } return (error); } static int i386_get_ioperm(td, args) struct thread *td; char *args; { int i, state, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); if (ua.start >= IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); if (td->td_pcb->pcb_ext == 0) { ua.length = 0; goto done; } iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; i = ua.start; state = (iomap[i >> 3] >> (i & 7)) & 1; ua.enable = !state; ua.length = 1; for (i = ua.start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) break; ua.length++; } done: error = copyout(&ua, args, sizeof(struct i386_ioperm_args)); return (error); } /* * Update the GDT entry pointing to the LDT to point to the LDT of the * current process. * * This must be called with sched_lock held. Unfortunately, we can't use a * mtx_assert() here because cpu_switch() calls this function after changing * curproc but before sched_lock's owner is updated in mi_switch(). */ void set_user_ldt(struct mdproc *mdp) { struct proc_ldt *pldt; pldt = mdp->md_ldt; #ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd; #else gdt[GUSERLDT_SEL].sd = pldt->ldt_sd; #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); } #ifdef SMP static void set_user_ldt_rv(struct thread *td) { if (td->td_proc != curthread->td_proc) return; mtx_lock_spin(&sched_lock); set_user_ldt(&td->td_proc->p_md); mtx_unlock_spin(&sched_lock); } #endif /* * Must be called with either sched_lock free or held but not recursed. * If it does not return NULL, it will return with it owned. 
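/*
 * [Editorial sketch -- not part of this commit.]  In the TSS I/O permission
 * bitmap a *cleared* bit grants access to a port, so i386_set_ioperm() above
 * clears "iomap[i >> 3] &= ~(1 << (i & 7))" to enable a port and sets the
 * bit to disable it.  A standalone demonstration on a small bitmap; the port
 * range is an assumption:
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned char iomap[8];			/* covers ports 0..63 here */
	unsigned start = 0x20, length = 4, i;

	memset(iomap, 0xff, sizeof(iomap));	/* all ports denied */
	for (i = start; i < start + length; i++)
		iomap[i >> 3] &= ~(1 << (i & 7));	/* grant access */

	for (i = start - 1; i <= start + length; i++)
		printf("port 0x%x %s\n", i,
		    (iomap[i >> 3] >> (i & 7)) & 1 ? "denied" : "allowed");
	return (0);
}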
*/ struct proc_ldt * user_ldt_alloc(struct mdproc *mdp, int len) { struct proc_ldt *pldt, *new_ldt; if (mtx_owned(&sched_lock)) mtx_unlock_spin(&sched_lock); mtx_assert(&sched_lock, MA_NOTOWNED); MALLOC(new_ldt, struct proc_ldt *, sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); new_ldt->ldt_len = len = NEW_MAX_LD(len); new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, len * sizeof(union descriptor)); if (new_ldt->ldt_base == NULL) { FREE(new_ldt, M_SUBPROC); return NULL; } new_ldt->ldt_refcnt = 1; new_ldt->ldt_active = 0; mtx_lock_spin(&sched_lock); gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base; gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1; ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd); if ((pldt = mdp->md_ldt)) { if (len > pldt->ldt_len) len = pldt->ldt_len; bcopy(pldt->ldt_base, new_ldt->ldt_base, len * sizeof(union descriptor)); } else { bcopy(ldt, new_ldt->ldt_base, sizeof(ldt)); } return new_ldt; } /* * Must be called either with sched_lock free or held but not recursed. * If md_ldt is not NULL, it will return with sched_lock released. */ void user_ldt_free(struct thread *td) { struct mdproc *mdp = &td->td_proc->p_md; struct proc_ldt *pldt = mdp->md_ldt; if (pldt == NULL) return; if (!mtx_owned(&sched_lock)) mtx_lock_spin(&sched_lock); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); if (td == PCPU_GET(curthread)) { lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); } mdp->md_ldt = NULL; if (--pldt->ldt_refcnt == 0) { mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base, pldt->ldt_len * sizeof(union descriptor)); FREE(pldt, M_SUBPROC); } else mtx_unlock_spin(&sched_lock); } static int i386_get_ldt(td, args) struct thread *td; char *args; { int error = 0; struct proc_ldt *pldt = td->td_proc->p_md.md_ldt; int nldt, num; union descriptor *lp; struct i386_ldt_args ua, *uap = &ua; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_get_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of LDTs exist */ if ((uap->start < 0) || (uap->num <= 0)) return(EINVAL); if (pldt) { nldt = pldt->ldt_len; num = min(uap->num, nldt); lp = &((union descriptor *)(pldt->ldt_base))[uap->start]; } else { nldt = sizeof(ldt)/sizeof(ldt[0]); num = min(uap->num, nldt); lp = &ldt[uap->start]; } if (uap->start + num > nldt) return(EINVAL); error = copyout(lp, uap->descs, num * sizeof(union descriptor)); if (!error) td->td_retval[0] = num; return(error); } static int i386_set_ldt(td, args) struct thread *td; char *args; { int error = 0, i, n; int largest_ld; struct mdproc *mdp = &td->td_proc->p_md; struct proc_ldt *pldt = mdp->md_ldt; struct i386_ldt_args ua, *uap = &ua; union descriptor *descs; caddr_t old_ldt_base; int descs_size, old_ldt_len; register_t savecrit; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_set_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of descriptors to modify */ if ((uap->start < 0) || (uap->start >= MAX_LD) || (uap->num < 0) || (uap->num > MAX_LD)) { return(EINVAL); } largest_ld = uap->start + uap->num - 1; if (largest_ld >= MAX_LD) return(EINVAL); /* allocate user ldt */ if (!pldt || largest_ld >= pldt->ldt_len) { struct proc_ldt *new_ldt = user_ldt_alloc(mdp, largest_ld); if (new_ldt == NULL) return ENOMEM; if (pldt) { old_ldt_base = pldt->ldt_base; old_ldt_len = pldt->ldt_len; pldt->ldt_sd 
= new_ldt->ldt_sd; pldt->ldt_base = new_ldt->ldt_base; pldt->ldt_len = new_ldt->ldt_len; mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)old_ldt_base, old_ldt_len * sizeof(union descriptor)); FREE(new_ldt, M_SUBPROC); #ifndef SMP mtx_lock_spin(&sched_lock); #endif } else { mdp->md_ldt = pldt = new_ldt; #ifdef SMP mtx_unlock_spin(&sched_lock); #endif } #ifdef SMP /* signal other cpus to reload ldt */ smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv, NULL, td); #else set_user_ldt(mdp); mtx_unlock_spin(&sched_lock); #endif } descs_size = uap->num * sizeof(union descriptor); descs = (union descriptor *)kmem_alloc(kernel_map, descs_size); if (descs == NULL) return (ENOMEM); error = copyin(&uap->descs[0], descs, descs_size); if (error) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (error); } /* Check descriptors for access violations */ for (i = 0, n = uap->start; i < uap->num; i++, n++) { union descriptor *dp; dp = &descs[i]; switch (dp->sd.sd_type) { case SDT_SYSNULL: /* system null */ dp->sd.sd_p = 0; break; case SDT_SYS286TSS: /* system 286 TSS available */ case SDT_SYSLDT: /* system local descriptor table */ case SDT_SYS286BSY: /* system 286 TSS busy */ case SDT_SYSTASKGT: /* system task gate */ case SDT_SYS286IGT: /* system 286 interrupt gate */ case SDT_SYS286TGT: /* system 286 trap gate */ case SDT_SYSNULL2: /* undefined by Intel */ case SDT_SYS386TSS: /* system 386 TSS available */ case SDT_SYSNULL3: /* undefined by Intel */ case SDT_SYS386BSY: /* system 386 TSS busy */ case SDT_SYSNULL4: /* undefined by Intel */ case SDT_SYS386IGT: /* system 386 interrupt gate */ case SDT_SYS386TGT: /* system 386 trap gate */ case SDT_SYS286CGT: /* system 286 call gate */ case SDT_SYS386CGT: /* system 386 call gate */ /* I can't think of any reason to allow a user proc * to create a segment of these types. They are * for OS use only. */ kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return EACCES; /*NOTREACHED*/ /* memory segment types */ case SDT_MEMEC: /* memory execute only conforming */ case SDT_MEMEAC: /* memory execute only accessed conforming */ case SDT_MEMERC: /* memory execute read conforming */ case SDT_MEMERAC: /* memory execute read accessed conforming */ /* Must be "present" if executable and conforming. */ if (dp->sd.sd_p == 0) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (EACCES); } break; case SDT_MEMRO: /* memory read only */ case SDT_MEMROA: /* memory read only accessed */ case SDT_MEMRW: /* memory read write */ case SDT_MEMRWA: /* memory read write accessed */ case SDT_MEMROD: /* memory read only expand dwn limit */ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ case SDT_MEMRWD: /* memory read write expand dwn limit */ case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */ case SDT_MEME: /* memory execute only */ case SDT_MEMEA: /* memory execute only accessed */ case SDT_MEMER: /* memory execute read */ case SDT_MEMERA: /* memory execute read accessed */ break; default: kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return(EINVAL); /*NOTREACHED*/ } /* Only user (ring-3) descriptors may be present. 
*/ if ((dp->sd.sd_p != 0) && (dp->sd.sd_dpl != SEL_UPL)) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (EACCES); } } /* Fill in range */ savecrit = intr_disable(); bcopy(descs, &((union descriptor *)(pldt->ldt_base))[uap->start], uap->num * sizeof(union descriptor)); td->td_retval[0] = uap->start; intr_restore(savecrit); kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (0); } Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 116360) +++ head/sys/i386/i386/trap.c (revision 116361) @@ -1,1095 +1,1095 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate)(struct trapframe *); extern void trap(struct trapframe frame); #ifdef I386_CPU extern int trapwrite(unsigned addr); #endif extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); type = frame.tf_trapno; #ifdef DDB if (db_active) { eva = (type == T_PAGEFLT ? rcr2() : 0); trap_fatal(&frame, eva); goto out; } #endif if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. 
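 *
 * A stand-alone sketch of the user/kernel decision applied just below and
 * throughout this handler: the low two bits of the saved %cs give the
 * privilege level (3 means user), and the VM bit in EFLAGS marks vm86
 * mode.  The bit values are the architectural x86 ones; the helper names
 * are invented for this sketch.
 */

#define SKETCH_EFLAGS_VM	0x00020000u	/* vm86 mode */
#define SKETCH_EFLAGS_IF	0x00000200u	/* interrupts enabled */

static int
trapped_from_user(unsigned int cs, unsigned int eflags)
{
	return ((cs & 3) == 3 || (eflags & SKETCH_EFLAGS_VM) != 0);
}

static int
trapped_with_interrupts_off(unsigned int eflags)
{
	return ((eflags & SKETCH_EFLAGS_IF) == 0);
}

/*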
*/ if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP && frame.tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts diasabled until later, * and we shouldn't enable interrupts while in a * critical section. */ if (type != T_PAGEFLT && td->td_critnest == 0) enable_intr(); } } eva = 0; code = frame.tf_err; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. * * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ eva = rcr2(); if (td->td_critnest == 0) enable_intr(); else trap_fatal(&frame, eva); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ #ifdef DEV_NPX ucode = npxtrap(); if (ucode == -1) goto userout; #else ucode = code; #endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) goto user; break; } /* FALLTHROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame.tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = type; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto userout; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... 
going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto userout; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; #if 0 PROC_LOCK(p); psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. 
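 *
 * The recovery idiom used above for doreti_iret and friends -- redirect a
 * faulting %eip to a known fixup label -- can be sketched as a small
 * lookup table.  The structure and function names are invented here; the
 * kernel hard-codes the handful of doreti_* labels instead.
 */

#include <stddef.h>
#include <stdint.h>

struct eip_fixup {
	uintptr_t fault_eip;		/* instruction that may fault */
	uintptr_t recover_eip;		/* where to resume instead */
};

static uintptr_t
lookup_fixup(const struct eip_fixup *tab, size_t n, uintptr_t eip)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tab[i].fault_eip == eip)
			return (tab[i].recover_eip);
	return (0);			/* no fixup: treat the fault as fatal */
}

/*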
*/ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(&frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: #ifdef DIAGNOSTIC cred_free_thread(td); #endif out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
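 *
 * A minimal sketch of the routing decision made here: the fault address
 * selects either the kernel map or the faulting process's map, and a
 * user-mode fault on a kernel address is refused outright.  The KERNBASE
 * value below is only an illustrative split point.
 */

#include <stdint.h>

#define SKETCH_KERNBASE	0xc0000000u	/* illustrative user/kernel split */

enum fault_target { FAULT_REFUSE, FAULT_KERNEL_MAP, FAULT_PROCESS_MAP };

static enum fault_target
classify_fault(uint32_t va, int usermode)
{
	if (va >= SKETCH_KERNBASE)
		return (usermode ? FAULT_REFUSE : FAULT_KERNEL_MAP);
	return (FAULT_PROCESS_MAP);
}

/*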
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
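 *
 * The fault-code decoding printed by trap_fatal() above stands on its own:
 * the three low bits of the x86 page-fault error code mean "protection
 * violation" (bit 0), "write access" (bit 1) and "user mode" (bit 2).
 */

#include <stdio.h>

static void
print_pf_error_code(unsigned int code)
{
	printf("fault code = %s %s, %s\n",
	    (code & 0x4) ? "user" : "supervisor",
	    (code & 0x2) ? "write" : "read",
	    (code & 0x1) ? "protection violation" : "page not present");
}

/*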
* * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } #ifdef I386_CPU /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct thread *td; struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); td = curthread; p = td->td_proc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } #endif /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_eflags; u_int sticks; int error; int narg; int args[8]; u_int code; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; orig_tf_eflags = frame.tf_eflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. 
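 *
 * A user-space sketch of the indirect-syscall decoding above: normally the
 * number arrives in %eax and the arguments start at the first stack word,
 * but the two indirection forms pull the real number from the stack first,
 * one int wide or one quad wide (to keep the remaining arguments
 * quad-aligned).  The values 0 and 198 stand in for SYS_syscall and
 * SYS___syscall.
 */

#include <stdint.h>
#include <string.h>

static unsigned int
decode_syscall(unsigned int code, const char *params, const char **argp)
{
	if (code == 0) {			/* "syscall" indirection */
		memcpy(&code, params, sizeof(int));
		params += sizeof(int);
	} else if (code == 198) {		/* "__syscall" indirection */
		memcpy(&code, params, sizeof(int));
		params += sizeof(int64_t);
	}
	*argp = params;				/* arguments start here */
	return (code);
}

/*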
*/ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, args); } switch (error) { case 0: frame.tf_eax = td->td_retval[0]; frame.tf_edx = td->td_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame.tf_eflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/i386/linux/linux_sysvec.c =================================================================== --- head/sys/i386/linux/linux_sysvec.c (revision 116360) +++ head/sys/i386/linux/linux_sysvec.c (revision 116361) @@ -1,973 +1,973 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!" 
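
/*
 * A compact sketch of the return disposition in the i386 syscall() above:
 * success stores the result and clears the carry flag, ERESTART rewinds
 * the PC by the size of the trapping instruction (7 for the lcall form, 2
 * for int 0x80, stashed in tf_err on entry), EJUSTRETURN leaves the frame
 * untouched, and anything else is reported as an errno with carry set.
 * The frame layout and error constants below are stand-ins, not the
 * kernel's struct trapframe or errno.h values.
 */

#define SKETCH_CARRY		0x1u
#define SKETCH_ERESTART		(-1)
#define SKETCH_EJUSTRETURN	(-2)

struct sketch_frame {
	unsigned int eax, edx, eip, eflags, err;
};

static void
dispose_return(struct sketch_frame *f, int error, unsigned int r0,
    unsigned int r1)
{
	switch (error) {
	case 0:
		f->eax = r0;
		f->edx = r1;
		f->eflags &= ~SKETCH_CARRY;
		break;
	case SKETCH_ERESTART:
		f->eip -= f->err;	/* re-execute the trapping instruction */
		break;
	case SKETCH_EJUSTRETURN:
		break;
	default:
		f->eax = (unsigned int)error;
		f->eflags |= SKETCH_CARRY;
		break;
	}
}
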
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); MODULE_DEPEND(linux, sysvmsg, 1, 1, 1); MODULE_DEPEND(linux, sysvsem, 1, 1, 1); MODULE_DEPEND(linux, sysvshm, 1, 1, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 extern char linux_sigcode[]; extern int linux_szsigcode; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(register_t **stack_base, struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params); static void linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code); static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings); /* * Linux syscalls return negative errno's, we do positive and map them */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75, -6, -84 }; int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 0, LINUX_SIGUSR1, LINUX_SIGUSR2 }; int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGTRAP, SIGABRT, SIGBUS, SIGFPE, SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, SIGPIPE, SIGALRM, SIGTERM, SIGBUS, SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, SIGIO, SIGURG, SIGSYS }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 
27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)argc + 1); (*stack_base)--; **stack_base = (intptr_t)(void *)envp; (*stack_base)--; **stack_base = (intptr_t)(void *)argv; (*stack_base)--; **stack_base = imgp->argc; return 0; } static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { Elf32_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe elf_linux_fixup(), should be curproc")); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } extern int _ucodesel, _udatasel; extern unsigned long linux_sznonrtsigcode; static void linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts */ frame.sf_si.lsi_signo = sig; frame.sf_si.lsi_code = code; frame.sf_si.lsi_addr = (void *)regs->tf_err; /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp, p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) + linux_sznonrtsigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack, i; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, sig, mask, code); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %lu"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. 
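 *
 * The frame-placement rule shared by both sendsig variants above, as a
 * stand-alone helper: when an alternate stack is configured, requested for
 * this signal, and not already in use, the frame is carved from the top of
 * that stack; otherwise it is pushed immediately below the interrupted
 * stack pointer.  The parameter names are invented for this sketch.
 */

#include <stddef.h>
#include <stdint.h>

static uintptr_t
place_sigframe(uintptr_t sp, uintptr_t altstack_base, size_t altstack_size,
    size_t framesize, int use_altstack, int already_on_altstack)
{
	if (use_altstack && !already_on_altstack)
		return (altstack_base + altstack_size - framesize);
	return (sp - framesize);
}

/*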
*/ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~(PSL_T | PSL_VM); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct proc *p = td->td_proc; struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; int eflags, i; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { trapsignal(td, SIGBUS, T_PROTFLT); return(EINVAL); } lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; PROC_LOCK(p); linux_to_bsd_sigset(&lmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); /* * Restore signal context. */ /* %gs was restored by the trampoline. 
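 *
 * The two sigreturn sanity checks above, in stand-alone form: only bits in
 * a user-changeable mask may differ from the live EFLAGS, and the restored
 * %cs must be a ring-3 selector.  The mask value below is a placeholder,
 * not the kernel's PSL_USERCHANGE definition.
 */

#define SKETCH_USERCHANGE	0x00000dd5u	/* illustrative mask only */

static int
eflags_secure(unsigned int new_ef, unsigned int old_ef)
{
	return (((new_ef ^ old_ef) & ~SKETCH_USERCHANGE) == 0);
}

static int
cs_secure(unsigned int cs)
{
	return ((cs & 3) == 3);		/* RPL must be user (ring 3) */
}

/*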
*/ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p = td->td_proc; struct l_ucontext uc; struct l_sigcontext *context; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { trapsignal(td, SIGBUS, T_PROTFLT); return(EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); /* * Restore signal context */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* * call sigaltstack & ignore results.. 
*/ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } /* * MPSAFE */ static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params) { args[0] = tf->tf_ebx; args[1] = tf->tf_ecx; args[2] = tf->tf_edx; args[3] = tf->tf_esi; args[4] = tf->tf_edi; args[5] = tf->tf_ebp; /* Unconfirmed */ *params = NULL; /* no copyin */ } /* * Dump core, into a file named as described in the comments for * expand_name(), unless the process was setuid/setgid. */ static int linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vmspace *vm = p->p_vmspace; char *tempuser; int error; if (ctob((uarea_pages + kstack_pages) + vm->vm_dsize + vm->vm_ssize) >= limit) return (EFAULT); tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP, M_WAITOK | M_ZERO); if (tempuser == NULL) return (ENOMEM); PROC_LOCK(p); fill_kinfo_proc(p, &p->p_uarea->u_kproc); PROC_UNLOCK(p); bcopy(p->p_uarea, tempuser, sizeof(struct user)); bcopy(td->td_frame, tempuser + ctob(uarea_pages) + ((caddr_t)td->td_frame - (caddr_t)td->td_kstack), sizeof(struct trapframe)); error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser, ctob(uarea_pages + kstack_pages), (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td); free(tempuser, M_TEMP); if (error == 0) error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, (int)ctob(vm->vm_dsize), (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td); if (error == 0) error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)), (off_t)ctob(uarea_pages + kstack_pages) + ctob(vm->vm_dsize), UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td); return (error); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { char *rpath = NULL; linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL, imgp->interpreter_name, &rpath, 0); if (rpath != imgp->interpreter_name) { int len = strlen(rpath) + 1; if (len <= MAXSHELLCMDLEN) { memcpy(imgp->interpreter_name, rpath, len); } free(rpath, M_TEMP); } } } return(error); } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. 
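 *
 * The SHELLMAGIC test used by the image activator above reads the first
 * two bytes of the image as one 16-bit word, which is why the constant
 * depends on byte order: '#' is 0x23 and '!' is 0x21, giving 0x2123 on a
 * little-endian host and 0x2321 on a big-endian one.  A byte-wise check,
 * sketched here, sidesteps the endianness question entirely.
 */

static int
is_shell_script(const unsigned char *image_header)
{
	return (image_header[0] == '#' && image_header[1] == '!');
}

/*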
*/ static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct pcb *pcb = td->td_pcb; exec_setregs(td, entry, stack, ps_strings); /* Linux sets %gs to 0, we default to _udatasel */ pcb->pcb_gs = 0; load_gs(0); } struct sysentvec linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0xff, LINUX_SIGTBLSZ, bsd_to_linux_signal, ELAST + 1, bsd_to_linux_errno, translate_traps, linux_fixup, linux_sendsig, linux_sigcode, &linux_szsigcode, linux_prepsyscall, "Linux a.out", linux_aout_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_linux_setregs }; struct sysentvec elf_linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0xff, LINUX_SIGTBLSZ, bsd_to_linux_signal, ELAST + 1, bsd_to_linux_errno, translate_traps, elf_linux_fixup, linux_sendsig, linux_sigcode, &linux_szsigcode, linux_prepsyscall, "Linux ELF", elf32_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_linux_setregs }; static Elf32_Brandinfo linux_brand = { ELFOSABI_LINUX, EM_386, "Linux", "/compat/linux", "/lib/ld-linux.so.1", &elf_linux_sysvec }; static Elf32_Brandinfo linux_glibc2brand = { ELFOSABI_LINUX, EM_386, "Linux", "/compat/linux", "/lib/ld-linux.so.2", &elf_linux_sysvec }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); linux_mib_destroy(); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: break; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/ia64/ia64/trap.c =================================================================== --- head/sys/ia64/ia64/trap.c (revision 116360) +++ head/sys/ia64/ia64/trap.c (revision 116361) @@ -1,1216 +1,1216 @@ /* $FreeBSD$ */ /* From: src/sys/alpha/alpha/trap.c,v 1.33 */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. 
Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif static int print_usertrap = 0; SYSCTL_INT(_machdep, OID_AUTO, print_usertrap, CTLFLAG_RW, &print_usertrap, 0, ""); extern int unaligned_fixup(struct trapframe *framep, struct thread *td); static void break_syscall(struct trapframe *tf); static void ia32_syscall(struct trapframe *framep); /* * EFI-Provided FPSWA interface (Floating Point SoftWare Assist */ /* The function entry address */ extern FPSWA_INTERFACE *fpswa_interface; /* Copy of the faulting instruction bundle */ typedef struct { u_int64_t bundle_low64; u_int64_t bundle_high64; } FPSWA_BUNDLE; /* * The fp state descriptor... tell FPSWA where the "true" copy is. * We save some registers in the trapframe, so we have to point some of * these there. 
The rest of the registers are "live" */ typedef struct { u_int64_t bitmask_low64; /* f63 - f2 */ u_int64_t bitmask_high64; /* f127 - f64 */ struct _ia64_fpreg *fp_low_preserved; /* f2 - f5 */ struct _ia64_fpreg *fp_low_volatile; /* f6 - f15 */ struct _ia64_fpreg *fp_high_preserved; /* f16 - f31 */ struct _ia64_fpreg *fp_high_volatile; /* f32 - f127 */ } FP_STATE; #ifdef WITNESS extern char *syscallnames[]; #endif static const char *ia64_vector_names[] = { "VHPT Translation", /* 0 */ "Instruction TLB", /* 1 */ "Data TLB", /* 2 */ "Alternate Instruction TLB", /* 3 */ "Alternate Data TLB", /* 4 */ "Data Nested TLB", /* 5 */ "Instruction Key Miss", /* 6 */ "Data Key Miss", /* 7 */ "Dirty-Bit", /* 8 */ "Instruction Access-Bit", /* 9 */ "Data Access-Bit", /* 10 */ "Break Instruction", /* 11 */ "External Interrupt", /* 12 */ "Reserved 13", /* 13 */ "Reserved 14", /* 14 */ "Reserved 15", /* 15 */ "Reserved 16", /* 16 */ "Reserved 17", /* 17 */ "Reserved 18", /* 18 */ "Reserved 19", /* 19 */ "Page Not Present", /* 20 */ "Key Permission", /* 21 */ "Instruction Access Rights", /* 22 */ "Data Access Rights", /* 23 */ "General Exception", /* 24 */ "Disabled FP-Register", /* 25 */ "NaT Consumption", /* 26 */ "Speculation", /* 27 */ "Reserved 28", /* 28 */ "Debug", /* 29 */ "Unaligned Reference", /* 30 */ "Unsupported Data Reference", /* 31 */ "Floating-point Fault", /* 32 */ "Floating-point Trap", /* 33 */ "Lower-Privilege Transfer Trap", /* 34 */ "Taken Branch Trap", /* 35 */ "Single Step Trap", /* 36 */ "Reserved 37", /* 37 */ "Reserved 38", /* 38 */ "Reserved 39", /* 39 */ "Reserved 40", /* 40 */ "Reserved 41", /* 41 */ "Reserved 42", /* 42 */ "Reserved 43", /* 43 */ "Reserved 44", /* 44 */ "IA-32 Exception", /* 45 */ "IA-32 Intercept", /* 46 */ "IA-32 Interrupt", /* 47 */ "Reserved 48", /* 48 */ "Reserved 49", /* 49 */ "Reserved 50", /* 50 */ "Reserved 51", /* 51 */ "Reserved 52", /* 52 */ "Reserved 53", /* 53 */ "Reserved 54", /* 54 */ "Reserved 55", /* 55 */ "Reserved 56", /* 56 */ "Reserved 57", /* 57 */ "Reserved 58", /* 58 */ "Reserved 59", /* 59 */ "Reserved 60", /* 60 */ "Reserved 61", /* 61 */ "Reserved 62", /* 62 */ "Reserved 63", /* 63 */ "Reserved 64", /* 64 */ "Reserved 65", /* 65 */ "Reserved 66", /* 66 */ "Reserved 67", /* 67 */ }; struct bitname { u_int64_t mask; const char* name; }; static void printbits(u_int64_t mask, struct bitname *bn, int count) { int i, first = 1; u_int64_t bit; for (i = 0; i < count; i++) { /* * Handle fields wider than one bit. 
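 *
 * The extraction done just below works for any contiguous mask:
 * mask & ~(mask - 1) isolates the lowest set bit, and dividing by that
 * bit shifts the field down to bit 0.  A stand-alone version (the mask
 * must be non-zero):
 */

#include <stdint.h>

static uint64_t
field_value(uint64_t word, uint64_t mask)
{
	uint64_t low = mask & ~(mask - 1);	/* lowest set bit of the mask */

	return ((word & mask) / low);
}

/*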
*/ bit = bn[i].mask & ~(bn[i].mask - 1); if (bn[i].mask > bit) { if (first) first = 0; else printf(","); printf("%s=%ld", bn[i].name, (mask & bn[i].mask) / bit); } else if (mask & bit) { if (first) first = 0; else printf(","); printf("%s", bn[i].name); } } } struct bitname psr_bits[] = { {IA64_PSR_BE, "be"}, {IA64_PSR_UP, "up"}, {IA64_PSR_AC, "ac"}, {IA64_PSR_MFL, "mfl"}, {IA64_PSR_MFH, "mfh"}, {IA64_PSR_IC, "ic"}, {IA64_PSR_I, "i"}, {IA64_PSR_PK, "pk"}, {IA64_PSR_DT, "dt"}, {IA64_PSR_DFL, "dfl"}, {IA64_PSR_DFH, "dfh"}, {IA64_PSR_SP, "sp"}, {IA64_PSR_PP, "pp"}, {IA64_PSR_DI, "di"}, {IA64_PSR_SI, "si"}, {IA64_PSR_DB, "db"}, {IA64_PSR_LP, "lp"}, {IA64_PSR_TB, "tb"}, {IA64_PSR_RT, "rt"}, {IA64_PSR_CPL, "cpl"}, {IA64_PSR_IS, "is"}, {IA64_PSR_MC, "mc"}, {IA64_PSR_IT, "it"}, {IA64_PSR_ID, "id"}, {IA64_PSR_DA, "da"}, {IA64_PSR_DD, "dd"}, {IA64_PSR_SS, "ss"}, {IA64_PSR_RI, "ri"}, {IA64_PSR_ED, "ed"}, {IA64_PSR_BN, "bn"}, {IA64_PSR_IA, "ia"}, }; static void printpsr(u_int64_t psr) { printbits(psr, psr_bits, sizeof(psr_bits)/sizeof(psr_bits[0])); } struct bitname isr_bits[] = { {IA64_ISR_CODE, "code"}, {IA64_ISR_VECTOR, "vector"}, {IA64_ISR_X, "x"}, {IA64_ISR_W, "w"}, {IA64_ISR_R, "r"}, {IA64_ISR_NA, "na"}, {IA64_ISR_SP, "sp"}, {IA64_ISR_RS, "rs"}, {IA64_ISR_IR, "ir"}, {IA64_ISR_NI, "ni"}, {IA64_ISR_SO, "so"}, {IA64_ISR_EI, "ei"}, {IA64_ISR_ED, "ed"}, }; static void printisr(u_int64_t isr) { printbits(isr, isr_bits, sizeof(isr_bits)/sizeof(isr_bits[0])); } static void printtrap(int vector, struct trapframe *framep, int isfatal, int user) { printf("\n"); printf("%s %s trap (cpu %d):\n", isfatal? "fatal" : "handled", user ? "user" : "kernel", PCPU_GET(cpuid)); printf("\n"); printf(" trap vector = 0x%x (%s)\n", vector, ia64_vector_names[vector]); printf(" cr.iip = 0x%lx\n", framep->tf_special.iip); printf(" cr.ipsr = 0x%lx (", framep->tf_special.psr); printpsr(framep->tf_special.psr); printf(")\n"); printf(" cr.isr = 0x%lx (", framep->tf_special.isr); printisr(framep->tf_special.isr); printf(")\n"); printf(" cr.ifa = 0x%lx\n", framep->tf_special.ifa); if (framep->tf_special.psr & IA64_PSR_IS) { printf(" ar.cflg = 0x%lx\n", ia64_get_cflg()); printf(" ar.csd = 0x%lx\n", ia64_get_csd()); printf(" ar.ssd = 0x%lx\n", ia64_get_ssd()); } printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_proc->p_comm); printf("\n"); } /* * */ int do_ast(struct trapframe *tf) { disable_intr(); while (curthread->td_flags & (TDF_ASTPENDING|TDF_NEEDRESCHED)) { enable_intr(); ast(tf); disable_intr(); } /* * Keep interrupts disabled. We return r10 as a favor to the EPC * syscall code so that it can quicky determine if the syscall * needs to be restarted or not. */ return (tf->tf_scratch.gr10); } /* * Trap is called from exception.s to handle most types of processor traps. */ /*ARGSUSED*/ void trap(int vector, struct trapframe *framep) { struct proc *p; struct thread *td; u_int64_t ucode; int i, user; u_int sticks; user = ((framep->tf_special.iip >> 61) < 5) ? 1 : 0; /* Short-circuit break instruction based system calls. */ if (vector == IA64_VEC_BREAK && framep->tf_special.ifa == 0x100000) { break_syscall(framep); return; } /* Sanitize the FP state in case the user has trashed it. 
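 *
 * The user/kernel test above leans on the ia64 region layout: the top
 * three bits of a virtual address pick one of eight regions, and this
 * kernel keeps regions 5-7 for itself, so an instruction pointer whose
 * region number is below 5 must belong to user space.  Stand-alone:
 */

#include <stdint.h>

static int
address_is_user(uint64_t va)
{
	return ((va >> 61) < 5);	/* regions 0-4: user, 5-7: kernel */
}

/*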
*/ ia64_set_fpsr(IA64_FPSR_DEFAULT); atomic_add_int(&cnt.v_trap, 1); td = curthread; p = td->td_proc; ucode = 0; if (user) { sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); } switch (vector) { case IA64_VEC_UNALIGNED_REFERENCE: { /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { i = unaligned_fixup(framep, td); if (i == 0) goto out; ucode = framep->tf_special.ifa; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; } case IA64_VEC_FLOATING_POINT_FAULT: { FP_STATE fp_state; FPSWA_RET fpswa_ret; FPSWA_BUNDLE bundle; /* Always fatal in kernel. Should never happen. */ if (!user) goto dopanic; if (fpswa_interface == NULL) { i = SIGFPE; ucode = 0; break; } mtx_lock(&Giant); i = copyin((void *)(framep->tf_special.iip), &bundle, 16); mtx_unlock(&Giant); if (i) { i = SIGBUS; /* EFAULT, basically */ ucode = /*a0*/ 0; /* exception summary */ break; } /* f6-f15 are saved in exception_save */ fp_state.bitmask_low64 = 0xffc0; /* bits 6 - 15 */ fp_state.bitmask_high64 = 0x0; fp_state.fp_low_preserved = NULL; fp_state.fp_low_volatile = &framep->tf_scratch_fp.fr6; fp_state.fp_high_preserved = NULL; fp_state.fp_high_volatile = NULL; /* The docs are unclear. Is Fpswa reentrant? */ fpswa_ret = fpswa_interface->Fpswa(1, &bundle, &framep->tf_special.psr, &framep->tf_special.fpsr, &framep->tf_special.isr, &framep->tf_special.pr, &framep->tf_special.cfm, &fp_state); if (fpswa_ret.status == 0) { /* fixed. update ipsr and iip to next insn */ int ei; ei = (framep->tf_special.isr >> 41) & 0x03; if (ei == 0) { /* no template for this case */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.psr |= IA64_ISR_EI_1; } else if (ei == 1) { /* MFI or MFB */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.psr |= IA64_ISR_EI_2; } else if (ei == 2) { /* MMF */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.iip += 0x10; } goto out; } else if (fpswa_ret.status == -1) { printf("FATAL: FPSWA err1 %lx, err2 %lx, err3 %lx\n", fpswa_ret.err1, fpswa_ret.err2, fpswa_ret.err3); panic("fpswa fatal error on fp fault"); } else if (fpswa_ret.status > 0) { #if 0 if (fpswa_ret.status & 1) { /* * New exception needs to be raised. * If set then the following bits also apply: * & 2 -> fault was converted to a trap * & 4 -> SIMD caused the exception */ i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } #endif i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } else { panic("bad fpswa return code %lx", fpswa_ret.status); } } case IA64_VEC_FLOATING_POINT_TRAP: { FP_STATE fp_state; FPSWA_RET fpswa_ret; FPSWA_BUNDLE bundle; /* Always fatal in kernel. Should never happen. 
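When FPSWA reports success in the floating-point fault case above, the handler must step past the faulting instruction: the slot number from cr.isr selects the next slot within the current 16-byte bundle, and slot 2 wraps to slot 0 of the following bundle. A simplified sketch of that advance, using plain integers in place of the psr/iip fields and assuming the usual three slots per bundle:

	#include <stdio.h>
	#include <stdint.h>

	/* Advance an (iip, slot) pair by one instruction slot. */
	static void
	next_slot(uint64_t *iip, unsigned *slot)
	{
		if (*slot < 2)
			(*slot)++;		/* stay within the same bundle */
		else {
			*slot = 0;		/* wrap to the next 16-byte bundle */
			*iip += 16;
		}
	}

	int
	main(void)
	{
		uint64_t iip = 0x2000;
		unsigned slot = 0;
		int i;

		for (i = 0; i < 4; i++) {
			printf("iip=0x%llx slot=%u\n", (unsigned long long)iip, slot);
			next_slot(&iip, &slot);
		}
		return (0);
	}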
*/ if (!user) goto dopanic; if (fpswa_interface == NULL) { i = SIGFPE; ucode = 0; break; } mtx_lock(&Giant); i = copyin((void *)(framep->tf_special.iip), &bundle, 16); mtx_unlock(&Giant); if (i) { i = SIGBUS; /* EFAULT, basically */ ucode = /*a0*/ 0; /* exception summary */ break; } /* f6-f15 are saved in exception_save */ fp_state.bitmask_low64 = 0xffc0; /* bits 6 - 15 */ fp_state.bitmask_high64 = 0x0; fp_state.fp_low_preserved = NULL; fp_state.fp_low_volatile = &framep->tf_scratch_fp.fr6; fp_state.fp_high_preserved = NULL; fp_state.fp_high_volatile = NULL; /* The docs are unclear. Is Fpswa reentrant? */ fpswa_ret = fpswa_interface->Fpswa(0, &bundle, &framep->tf_special.psr, &framep->tf_special.fpsr, &framep->tf_special.isr, &framep->tf_special.pr, &framep->tf_special.cfm, &fp_state); if (fpswa_ret.status == 0) { /* fixed */ /* * should we increment iip like the fault case? * or has fpswa done something like normalizing a * register so that we should just rerun it? */ goto out; } else if (fpswa_ret.status == -1) { printf("FATAL: FPSWA err1 %lx, err2 %lx, err3 %lx\n", fpswa_ret.err1, fpswa_ret.err2, fpswa_ret.err3); panic("fpswa fatal error on fp trap"); } else if (fpswa_ret.status > 0) { i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } else { panic("bad fpswa return code %lx", fpswa_ret.status); } } case IA64_VEC_DISABLED_FP: { /* High FP registers are disabled. */ struct pcpu *pcpu; struct pcb *pcb; struct thread *thr; /* Always fatal in kernel. Should never happen. */ if (!user) goto dopanic; pcb = td->td_pcb; pcpu = pcb->pcb_fpcpu; #if 0 printf("XXX: td %p: highfp on cpu %p\n", td, pcpu); #endif /* * The pcpu variable holds the address of the per-CPU * structure of the CPU currently holding this threads * high FP registers (or NULL if no CPU holds these * registers). We have to interrupt that CPU and wait * for it to have saved the registers. */ if (pcpu != NULL) { thr = pcpu->pc_fpcurthread; KASSERT(thr == td, ("High FP state out of sync")); if (pcpu == pcpup) { /* * Short-circuit handling the trap when this * CPU already holds the high FP registers for * this thread. We really shouldn't get the * trap in the first place, but since it's * only a performance issue and not a * correctness issue, we emit a message for * now, enable the high FP registers and * return. */ printf("XXX: bogusly disabled high FP regs\n"); framep->tf_special.psr &= ~IA64_PSR_DFH; goto out; } #ifdef SMP /* * Interrupt the other CPU so that it saves the high * FP registers of this thread. Note that this can * only happen for the SMP case. */ ipi_send(pcpu->pc_lid, IPI_HIGH_FP); #endif #ifdef DIAGNOSTICS } else { KASSERT(PCPU_GET(fpcurthread) != td, ("High FP state out of sync")); #endif } thr = PCPU_GET(fpcurthread); #if 0 printf("XXX: cpu %p: highfp belongs to td %p\n", pcpup, thr); #endif /* * The thr variable holds the thread that owns the high FP * registers currently on this CPU. Free this CPU so that * we can load the current threads high FP registers. */ if (thr != NULL) { KASSERT(thr != td, ("High FP state out of sync")); pcb = thr->td_pcb; KASSERT(pcb->pcb_fpcpu == pcpup, ("High FP state out of sync")); ia64_highfp_save(thr); } /* * Wait for the other CPU to have saved out high FP * registers (if applicable). 
*/ while (pcpu && pcpu->pc_fpcurthread == td); ia64_highfp_load(td); framep->tf_special.psr &= ~IA64_PSR_DFH; goto out; break; } case IA64_VEC_PAGE_NOT_PRESENT: case IA64_VEC_INST_ACCESS_RIGHTS: case IA64_VEC_DATA_ACCESS_RIGHTS: { vm_offset_t va; struct vmspace *vm; vm_map_t map; vm_prot_t ftype; int rv; rv = 0; va = framep->tf_special.ifa; /* * If it was caused by fuswintr or suswintr, just punt. Note * that we check the faulting address against the address * accessed by [fs]uswintr, in case another fault happens when * they are running. */ if (!user && td != NULL && td->td_pcb->pcb_accessaddr == va && td->td_pcb->pcb_onfault == (unsigned long)fswintrberr) { framep->tf_special.iip = td->td_pcb->pcb_onfault; framep->tf_special.psr &= ~IA64_PSR_RI; td->td_pcb->pcb_onfault = 0; goto out; } va = trunc_page((vm_offset_t)va); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults for kernel virtual * addresses */ if (user) goto no_fault_in; map = kernel_map; } else { vm = (p != NULL) ? p->p_vmspace : NULL; if (vm == NULL) goto no_fault_in; map = &vm->vm_map; } if (framep->tf_special.isr & IA64_ISR_X) ftype = VM_PROT_EXECUTE; else if (framep->tf_special.isr & IA64_ISR_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or * stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) goto out; no_fault_in: /* * Additionally check the privilege level. We don't want to * panic when we're in the gateway page, running at user * level. This happens for the signal trampolines. Note that * when that happens, user is defined as 0 above. We need to * set user to 1 to force calling userret() and do_ast(). */ if (!TRAPF_USERMODE(framep)) { /* Check for copyin/copyout fault. */ if (td != NULL && td->td_pcb->pcb_onfault != 0) { framep->tf_special.iip = td->td_pcb->pcb_onfault; framep->tf_special.psr &= ~IA64_PSR_RI; td->td_pcb->pcb_onfault = 0; goto out; } goto dopanic; } else user = 1; ucode = va; i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; break; } case IA64_VEC_BREAK: case IA64_VEC_DEBUG: case IA64_VEC_SINGLE_STEP_TRAP: case IA64_VEC_TAKEN_BRANCH_TRAP: { /* * These are always fatal in kernel, and should never happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured. */ if (kdb_trap(vector, framep)) return; /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! 
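The page-fault path above derives the access type being requested from cr.isr: the X bit means an instruction fetch, the W bit a store, and anything else is treated as a load. A small user-space stand-in for that classification is below; ISR_X and ISR_W are hypothetical placeholders for the real IA64_ISR_* masks defined in the ia64 headers.

	#include <stdio.h>
	#include <stdint.h>

	/* Hypothetical stand-ins for IA64_ISR_X and IA64_ISR_W. */
	#define ISR_X	(1ULL << 32)
	#define ISR_W	(1ULL << 33)

	enum fault_type { FAULT_READ, FAULT_WRITE, FAULT_EXECUTE };

	static enum fault_type
	classify(uint64_t isr)
	{
		if (isr & ISR_X)
			return (FAULT_EXECUTE);	/* instruction fetch */
		if (isr & ISR_W)
			return (FAULT_WRITE);	/* store */
		return (FAULT_READ);		/* load, or anything else */
	}

	int
	main(void)
	{
		static const char *name[] = { "read", "write", "execute" };

		printf("%s\n", name[classify(ISR_W)]);
		return (0);
	}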
*/ #endif goto dopanic; } i = SIGTRAP; break; } case IA64_VEC_GENERAL_EXCEPTION: { if (user) { ucode = vector; i = SIGILL; break; } goto dopanic; } case IA64_VEC_UNSUPP_DATA_REFERENCE: case IA64_VEC_LOWER_PRIVILEGE_TRANSFER: { if (user) { ucode = vector; i = SIGBUS; break; } goto dopanic; } case IA64_VEC_IA32_EXCEPTION: { u_int64_t isr = framep->tf_special.isr; switch ((isr >> 16) & 0xffff) { case IA32_EXCEPTION_DIVIDE: ucode = FPE_INTDIV; i = SIGFPE; break; case IA32_EXCEPTION_DEBUG: case IA32_EXCEPTION_BREAK: i = SIGTRAP; break; case IA32_EXCEPTION_OVERFLOW: ucode = FPE_INTOVF; i = SIGFPE; break; case IA32_EXCEPTION_BOUND: ucode = FPE_FLTSUB; i = SIGFPE; break; case IA32_EXCEPTION_DNA: ucode = 0; i = SIGFPE; break; case IA32_EXCEPTION_NOT_PRESENT: case IA32_EXCEPTION_STACK_FAULT: case IA32_EXCEPTION_GPFAULT: ucode = (isr & 0xffff) + BUS_SEGM_FAULT; i = SIGBUS; break; case IA32_EXCEPTION_FPERROR: ucode = 0; /* XXX */ i = SIGFPE; break; case IA32_EXCEPTION_ALIGNMENT_CHECK: ucode = framep->tf_special.ifa; /* VA */ i = SIGBUS; break; case IA32_EXCEPTION_STREAMING_SIMD: ucode = 0; /* XXX */ i = SIGFPE; break; default: goto dopanic; } break; } case IA64_VEC_IA32_INTERRUPT: { /* * INT n instruction - probably a syscall. */ if (((framep->tf_special.isr >> 16) & 0xffff) == 0x80) { ia32_syscall(framep); goto out; } else { ucode = (framep->tf_special.isr >> 16) & 0xffff; i = SIGILL; break; } } case IA64_VEC_IA32_INTERCEPT: { /* * Maybe need to emulate ia32 instruction. */ goto dopanic; } default: goto dopanic; } if (print_usertrap) printtrap(vector, framep, 1, user); trapsignal(td, i, ucode); out: if (user) { userret(td, framep, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif do_ast(framep); } return; dopanic: printtrap(vector, framep, 1, user); #ifdef DDB kdb_trap(vector, framep); #endif panic("trap"); } /* * Handle break instruction based system calls. */ void break_syscall(struct trapframe *tf) { uint64_t *bsp, *tfp; uint64_t iip, psr; int error, nargs; /* Save address of break instruction. */ iip = tf->tf_special.iip; psr = tf->tf_special.psr; /* Advance to the next instruction. */ tf->tf_special.psr += IA64_PSR_RI_1; if ((tf->tf_special.psr & IA64_PSR_RI) > IA64_PSR_RI_2) { tf->tf_special.iip += 16; tf->tf_special.psr &= ~IA64_PSR_RI; } /* * Copy the arguments on the register stack into the trapframe * to avoid having interleaved NaT collections. */ tfp = &tf->tf_scratch.gr16; nargs = tf->tf_special.cfm & 0x7f; bsp = (uint64_t*)(curthread->td_kstack + tf->tf_special.ndirty); bsp -= (((uintptr_t)bsp & 0x1ff) < (nargs << 3)) ? (nargs + 1): nargs; while (nargs--) { *tfp++ = *bsp++; if (((uintptr_t)bsp & 0x1ff) == 0x1f8) bsp++; } error = syscall(tf); if (error == ERESTART) { tf->tf_special.iip = iip; tf->tf_special.psr = psr; } do_ast(tf); } /* * Process a system call. * * See syscall.s for details as to how we get here. In order to support * the ERESTART case, we return the error to our caller. They deal with * the hairy details. 
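break_syscall() above copies the incoming arguments out of the register-stack backing store, which stores one NaT-collection word for every 63 data slots (at offset 0x1f8 within each 512-byte span). That is why the copy loop skips a slot whenever the pointer lands on that offset, and why the starting pointer is backed up by one extra slot when the arguments straddle a collection word. A standalone sketch of the forward walk, using a simulated backing-store array rather than real RSE state:

	#include <stdio.h>
	#include <stdint.h>

	#define SLOTS		256
	#define NAT_SLOT(i)	(((i) & 63) == 63)	/* one collection word per 63 data slots */

	int
	main(void)
	{
		uint64_t store[SLOTS], args[8];
		int i, n = 8, src = 60;		/* pretend the first argument is at slot 60 */

		/* Fill the simulated backing store; collection slots get a marker. */
		for (i = 0; i < SLOTS; i++)
			store[i] = NAT_SLOT(i) ? ~0ULL : (uint64_t)i;

		/* Copy n argument slots, stepping over collection words. */
		for (i = 0; i < n; i++) {
			args[i] = store[src++];
			if (NAT_SLOT(src))
				src++;		/* skip the NaT collection word */
		}

		for (i = 0; i < n; i++)
			printf("arg%d = %llu\n", i, (unsigned long long)args[i]);
		return (0);
	}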
*/ int syscall(struct trapframe *tf) { struct sysent *callp; struct proc *p; struct thread *td; u_int64_t *args; int code, error; u_int sticks; code = tf->tf_scratch.gr15; args = &tf->tf_scratch.gr16; atomic_add_int(&cnt.v_syscall, 1); td = curthread; p = td->td_proc; td->td_frame = tf; sticks = td->td_sticks; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(tf, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the ia64, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. */ code = args[0]; args++; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; /* * Try to run the syscall without Giant if the syscall is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, (callp->sy_narg & SYF_ARGMASK), args); #endif td->td_retval[0] = 0; td->td_retval[1] = 0; tf->tf_scratch.gr10 = EJUSTRETURN; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(td, args); if (error != EJUSTRETURN) { /* * Save the "raw" error code in r10. We use this to handle * syscall restarts (see do_ast()). */ tf->tf_scratch.gr10 = error; if (error == 0) { tf->tf_scratch.gr8 = td->td_retval[0]; tf->tf_scratch.gr9 = td->td_retval[1]; } else if (error != ERESTART) { if (error < p->p_sysent->sv_errsize) error = p->p_sysent->sv_errtbl[error]; /* * Translated error codes are returned in r8. User * processes use the translated error code. */ tf->tf_scratch.gr8 = error; } } /* * Release Giant if we had to get it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); userret(td, tf, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); return (error); } #include static void ia32_syscall(struct trapframe *framep) { caddr_t params; int i; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_eflags; u_int sticks; int error; int narg; u_int32_t args[8]; u_int64_t args64[8]; u_int code; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ cnt.v_syscall++; sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); params = (caddr_t)(framep->tf_special.sp & ((1L<<32)-1)) + sizeof(u_int32_t); code = framep->tf_scratch.gr8; /* eax */ orig_eflags = ia64_get_eflag(); if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. 
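The syscall() path above keeps two copies of the result: r10 carries the raw error (EJUSTRETURN when the registers are already set up, ERESTART when the break/EPC code should re-execute the instruction) and r8 carries either the first return value or the errno after translation. break_syscall() saves iip and psr before advancing them so that an ERESTART can simply restore them. A condensed sketch of that convention, with ordinary struct fields standing in for the trapframe and the error translation omitted:

	#include <stdio.h>

	#define ERESTART	(-1)	/* kernel-internal: re-run the instruction */
	#define EJUSTRETURN	(-2)	/* kernel-internal: registers already set up */

	struct frame {
		long	gr8;		/* return value / translated error */
		long	gr10;		/* raw error, checked by the return path */
		long	iip, psr;	/* enough state to re-execute the syscall */
	};

	static void
	finish_syscall(struct frame *tf, int error, long retval,
	    long saved_iip, long saved_psr)
	{
		tf->gr10 = error;
		if (error == 0)
			tf->gr8 = retval;
		else if (error == ERESTART) {
			tf->iip = saved_iip;	/* back up to the break/epc instruction */
			tf->psr = saved_psr;
		} else if (error != EJUSTRETURN)
			tf->gr8 = error;	/* user space sees the errno in r8 */
	}

	int
	main(void)
	{
		struct frame tf = { 0, 0, 0x2010, 0 };

		finish_syscall(&tf, ERESTART, 0, 0x2000, 0);
		printf("restart: iip=0x%lx gr10=%ld\n", tf.iip, tf.gr10);
		finish_syscall(&tf, 0, 42, 0x2000, 0);
		printf("success: gr8=%ld gr10=%ld\n", tf.gr8, tf.gr10);
		return (0);
	}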
*/ code = fuword32(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. */ code = fuword32(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; for (i = 0; i < narg; i++) args64[i] = args[i]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args64); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = framep->tf_scratch.gr10; /* edx */ STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, args64); } switch (error) { case 0: framep->tf_scratch.gr8 = td->td_retval[0]; /* eax */ framep->tf_scratch.gr10 = td->td_retval[1]; /* edx */ ia64_set_eflag(ia64_get_eflag() & ~PSL_C); break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. XXX Assume int 0x80. */ framep->tf_special.iip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_scratch.gr8 = error; ia64_set_eflag(ia64_get_eflag() | PSL_C); break; } /* * Traced syscall. */ if ((orig_eflags & PSL_T) && !(orig_eflags & PSL_VM)) { ia64_set_eflag(ia64_get_eflag() & ~PSL_T); trapsignal(td, SIGTRAP, 0); } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Handle reschedule and other end-of-syscall issues */ userret(td, framep, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_clock.c =================================================================== --- head/sys/kern/kern_clock.c (revision 116360) +++ head/sys/kern/kern_clock.c (revision 116361) @@ -1,483 +1,483 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
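For IA-32 binaries the error convention in ia32_syscall() above is the native i386 one: on success eax/edx (gr8/gr10 here) hold the return values and the carry flag is cleared, on error the errno goes in eax and carry is set, and ERESTART instead rewinds the instruction pointer by the two-byte length of `int 0x80`. A small sketch of just the success/failure split, with a plain psl_c flag standing in for the real PSL_C bit of ar.eflag:

	#include <stdio.h>

	struct ia32_result {
		unsigned int	eax, edx;
		int		psl_c;		/* carry flag: set on error */
	};

	static void
	set_result(struct ia32_result *r, int error, unsigned int rv0, unsigned int rv1)
	{
		if (error == 0) {
			r->eax = rv0;
			r->edx = rv1;
			r->psl_c = 0;		/* success: clear carry */
		} else {
			r->eax = (unsigned int)error;
			r->psl_c = 1;		/* failure: errno in eax, carry set */
		}
	}

	int
	main(void)
	{
		struct ia32_result r;

		set_result(&r, 0, 42, 0);
		printf("eax=%u carry=%d\n", r.eax, r.psl_c);
		set_result(&r, 2 /* ENOENT */, 0, 0);
		printf("eax=%u carry=%d\n", r.eax, r.psl_c);
		return (0);
	}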
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. */ long cp_time[CPUSTATES]; SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), "LU", "CPU time statistics"); /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. 
*/ int stathz; int profhz; int profprocs; int ticks; int psratio; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_process() for the boot CPU, so only * the other CPUs in the system need to call this function. */ void hardclock_process(frame) register struct clockframe *frame; { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; /* * Run current process's virtual and profile time, as needed. */ mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* XXXKSE What to do? */ } else { pstats = p->p_stats; if (CLKF_USERMODE(frame) && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { p->p_sflag |= PS_ALRMPEND; td->td_flags |= TDF_ASTPENDING; } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { p->p_sflag |= PS_PROFPEND; td->td_flags |= TDF_ASTPENDING; } } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } /* * The real-time timer, interrupting hz times per second. */ void hardclock(frame) register struct clockframe *frame; { int need_softclock = 0; CTR0(KTR_CLK, "hardclock fired"); hardclock_process(frame); tc_ticktock(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(frame); statclock(frame); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ /* * Process callouts at a very low cpu priority, so we don't keep the * relatively high clock interrupt priority any longer than necessary. */ mtx_lock_spin_flags(&callout_lock, MTX_QUIET); ticks++; if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { need_softclock = 1; } else if (softticks + 1 == ticks) ++softticks; mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* * swi_sched acquires sched_lock, so we don't want to call it with * callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. 
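hardclock() above only wakes the softclock thread when the bucket for the new tick has work queued: the callout wheel is a power-of-two array of lists hashed by expiry tick, so the lookup is simply `ticks & callwheelmask`. A minimal stand-alone model of that hashing follows, with plain counters in place of callout structures; the real softclock also compares each entry's stored absolute tick, so entries that hash to the same bucket but belong to a later lap of the wheel are not run early.

	#include <stdio.h>

	#define WHEEL_SIZE	256			/* must be a power of two */
	#define WHEEL_MASK	(WHEEL_SIZE - 1)

	static int wheel[WHEEL_SIZE];			/* pending callouts per bucket */

	static void
	schedule_after(int now, int delta)
	{
		wheel[(now + delta) & WHEEL_MASK]++;	/* hash on the expiry tick */
	}

	int
	main(void)
	{
		int ticks = 1000;

		schedule_after(ticks, 5);
		schedule_after(ticks, 5 + WHEEL_SIZE);	/* same bucket, one lap later */

		/* hardclock-style check: anything hashed to the current tick? */
		for (; ticks <= 1006; ticks++)
			if (wheel[ticks & WHEEL_MASK])
				printf("tick %d: bucket %d has %d entries\n",
				    ticks, ticks & WHEEL_MASK, wheel[ticks & WHEEL_MASK]);
		return (0);
	}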
*/ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvotohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + ((unsigned long)usec + (tick - 1)) / tick + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { /* * XXX; Right now sched_lock protects statclock(), but perhaps * it should be protected later on by a time_lock, which would * cover psdiv, etc. as well. */ PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { mtx_lock_spin(&sched_lock); p->p_flag |= P_PROFIL; if (++profprocs == 1) cpu_startprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Stop profiling on a process. */ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; while (p->p_profthreads != 0) msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", NULL); p->p_flag &= ~P_STOPPROF; } mtx_lock_spin(&sched_lock); p->p_flag &= ~P_PROFIL; if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. Most of the statistics are only * used by user-level statistics programs. The main exceptions are * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. * This should be called by all active processors. */ void statclock(frame) register struct clockframe *frame; { struct pstats *pstats; struct rusage *ru; struct vmspace *vm; struct thread *td; struct kse *ke; struct proc *p; long rss; td = curthread; p = td->td_proc; mtx_lock_spin_flags(&sched_lock, MTX_QUIET); ke = td->td_kse; if (CLKF_USERMODE(frame)) { /* * Charge the time as appropriate. */ - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_statclock(1); p->p_uticks++; if (ke->ke_ksegrp->kg_nice > NZERO) cp_time[CP_NICE]++; else cp_time[CP_USER]++; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { p->p_iticks++; cp_time[CP_INTR]++; } else { - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_statclock(0); td->td_sticks++; p->p_sticks++; if (p != PCPU_GET(idlethread)->td_proc) cp_time[CP_SYS]++; else cp_time[CP_IDLE]++; } } sched_clock(ke); /* Update resource usage integrals and maximums. 
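tvtohz() above converts a timeval into a tick count while dodging overflow: it rounds the microseconds up to whole ticks, adds one tick for the partially elapsed current tick, and falls back to coarser arithmetic (or clamps) once the seconds value is large. A simplified user-space rendition of the same computation, assuming hz = 100 and omitting the negative-usec normalization:

	#include <limits.h>
	#include <stdio.h>

	#define HZ	100
	#define TICK	(1000000 / HZ)		/* microseconds per tick */

	static int
	tv_to_ticks(long sec, long usec)
	{
		unsigned long t;

		if (sec < 0 || (sec == 0 && usec <= 0))
			return (1);		/* already past: expire on the next tick */
		if (sec <= LONG_MAX / 1000000)
			t = (sec * 1000000 + (unsigned long)usec + (TICK - 1)) / TICK + 1;
		else if (sec <= LONG_MAX / HZ)
			t = sec * HZ + ((unsigned long)usec + (TICK - 1)) / TICK + 1;
		else
			t = LONG_MAX;
		return (t > INT_MAX ? INT_MAX : (int)t);
	}

	int
	main(void)
	{
		printf("%d\n", tv_to_ticks(0, 25000));	/* 25ms -> 4 ticks (3 rounded up, +1 current) */
		printf("%d\n", tv_to_ticks(2, 0));	/* 2s   -> 201 ticks */
		return (0);
	}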
*/ if ((pstats = p->p_stats) != NULL && (ru = &pstats->p_ru) != NULL && (vm = p->p_vmspace) != NULL) { ru->ru_ixrss += pgtok(vm->vm_tsize); ru->ru_idrss += pgtok(vm->vm_dsize); ru->ru_isrss += pgtok(vm->vm_ssize); rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } void profclock(frame) register struct clockframe *frame; { struct thread *td; #ifdef GPROF struct gmonparam *g; int i; #endif td = curthread; if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. */ td = curthread; if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, CLKF_PC(frame), 1); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON) { i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } } #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 116360) +++ head/sys/kern/kern_exec.c (revision 116361) @@ -1,1208 +1,1208 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
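profclock()'s kernel branch above is a straight PC-sample histogram: the sampled program counter is offset from lowpc, scaled down by HISTFRACTION times the counter size, and the matching counter is bumped. A small user-space model of that bucketing; the text range and scale here are made up, where the real values come from struct gmonparam.

	#include <stdio.h>
	#include <stdint.h>

	#define LOWPC		0x1000u
	#define TEXTSIZE	0x4000u
	#define HISTFRACTION	2	/* each counter covers HISTFRACTION * sizeof(counter) bytes */

	static unsigned short kcount[TEXTSIZE / (HISTFRACTION * sizeof(unsigned short))];

	static void
	sample(uintptr_t pc)
	{
		uintptr_t i = pc - LOWPC;

		if (i < TEXTSIZE) {
			i /= HISTFRACTION * sizeof(unsigned short);
			kcount[i]++;
		}
	}

	int
	main(void)
	{
		sample(0x1234);
		sample(0x1235);			/* lands in the same bucket */
		printf("bucket %u = %d\n",
		    (unsigned)((0x1234u - LOWPC) / (HISTFRACTION * sizeof(unsigned short))),
		    kcount[(0x1234u - LOWPC) / (HISTFRACTION * sizeof(unsigned short))]);
		return (0);
	}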
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int kern_execve(struct thread *td, char *fname, char **argv, char **envv, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); int ps_argsopen = 1; SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, ""); #ifdef __ia64__ /* XXX HACK */ static int regstkpages = 256; SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, ®stkpages, 0, ""); #endif static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings))); } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack))); } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. * * MPSAFE */ static int kern_execve(td, fname, argv, envv, mac_p) struct thread *td; char *fname; char **argv; char **envv; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int textset; #ifdef MAC struct label interplabel; /* label of the interpreted vnode */ struct label execlabel; /* optional label argument */ int will_transition, interplabelvalid = 0; #endif imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. 
*/ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { if (thread_single(SINGLE_EXIT)) { PROC_UNLOCK(p); return (ERESTART); /* Try again later. */ } /* * If we get here all other threads are dead, * so unset the associated flags and lose KSE mode. */ - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; td->td_mailbox = NULL; thread_single_end(); } p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->userspace_argv = argv; imgp->userspace_envv = envv; imgp->execlabel = NULL; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; #ifdef MAC error = mac_execve_enter(imgp, mac_p, &execlabel); if (error) { mtx_lock(&Giant); goto exec_fail; } #endif /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); if (imgp->stringbase == NULL) { error = ENOMEM; mtx_lock(&Giant); goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = imgp->stringbase + ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, fname, td); mtx_lock(&Giant); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); goto exec_fail; } imgp->vp = ndp->ni_vp; imgp->fname = fname; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. 
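The exec path above tries each registered image activator in turn: a return of -1 means "not my format, keep looking", 0 means the image was claimed and set up, and anything else is a hard error, with ENOEXEC reported if nobody claims the image. Scripts work by having their activator set imgp->interpreted so the whole lookup restarts on the interpreter. A stripped-down model of that dispatch loop, with toy header checks in place of the real execsw entries:

	#include <errno.h>
	#include <stdio.h>

	/* Toy activators: -1 = not mine, 0 = claimed, >0 = hard error. */
	static int try_elf(const char *hdr)    { return ((hdr[0] == 0x7f) ? 0 : -1); }
	static int try_script(const char *hdr) { return ((hdr[0] == '#' && hdr[1] == '!') ? 0 : -1); }

	static int (*activators[])(const char *) = { try_elf, try_script, NULL };

	static int
	activate(const char *hdr)
	{
		int error = -1, i;

		for (i = 0; error == -1 && activators[i] != NULL; i++)
			error = activators[i](hdr);
		return (error == -1 ? ENOEXEC : error);
	}

	int
	main(void)
	{
		printf("script: %d\n", activate("#!/bin/sh"));
		printf("other:  %d\n", activate("GIF89a"));
		return (0);
	}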
*/ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC mac_init_vnode_label(&interplabel); mac_copy_vnode_label(&ndp->ni_vp->v_label, &interplabel); interplabelvalid = 1; #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK(p->p_fd); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->endargs - imgp->stringbase; if (ps_arg_cache_limit >= i + sizeof(struct pargs)) newargs = pargs_alloc(i); /* close files on exec */ fdcloseexec(td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_execve_will_transition(oldcred, imgp->vp, interplabelvalid ? &interplabel : NULL, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. 
*/ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && suser_cred(oldcred, PRISON_ROOT)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_execve_transition(oldcred, newcred, imgp->vp, interplabelvalid ? &interplabel : NULL, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. */ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Free any previous argument cache */ oldargs = p->p_args; p->p_args = NULL; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { bcopy(imgp->stringbase, newargs->ar_args, i); p->p_args = newargs; newargs = NULL; } PROC_UNLOCK(p); /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); /* * Handle deferred decrement of ref counts. 
*/ if (textvp != NULL) vrele(textvp); if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage) exec_unmap_first_page(imgp); if (imgp->vp) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); if (imgp->object) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ #ifdef MAC mac_execve_exit(imgp); if (interplabelvalid) mac_destroy_vnode_label(&interplabel); #endif exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ error = 0; } done2: #ifdef MAC mac_execve_exit(imgp); if (interplabelvalid) mac_destroy_vnode_label(&interplabel); #endif mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * MPSAFE */ int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif /* * MPSAFE */ int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC return (kern_execve(td, uap->fname, uap->argv, uap->envv, uap->mac_p)); #else return (ENOSYS); #endif } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; GIANT_REQUIRED; if (imgp->firstpage) { exec_unmap_first_page(imgp); } VOP_GETVOBJECT(imgp->vp, &object); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); vm_page_lock_queues(); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { vm_page_lock_queues(); if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) { vm_page_unlock_queues(); break; } if (ma[i]->valid) { vm_page_unlock_queues(); break; } vm_page_busy(ma[i]); vm_page_unlock_queues(); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); vm_page_lock_queues(); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { pmap_remove_all(ma[0]); vm_page_free(ma[0]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (EIO); } } VM_OBJECT_UNLOCK(object); vm_page_wire(ma[0]); vm_page_wakeup(ma[0]); vm_page_unlock_queues(); pmap_qenter((vm_offset_t)imgp->image_header, ma, 1); imgp->firstpage = ma[0]; return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { GIANT_REQUIRED; if (imgp->firstpage) { 
pmap_qremove((vm_offset_t)imgp->image_header, 1); vm_page_lock_queues(); vm_page_unwire(imgp->firstpage, 1); vm_page_unlock_queues(); imgp->firstpage = NULL; } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; GIANT_REQUIRED; stack_addr = sv->sv_usrstack - maxssiz; imgp->vmspace_destroyed = 1; EVENTHANDLER_INVOKE(process_exec, p); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map), vm_map_max(map)); vm_page_unlock_queues(); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, 0); if (error) return (error); #ifdef __ia64__ { /* * Allocate backing store. We really need something * similar to vm_map_stack which can allow the backing * store to grow upwards. This will do for now. */ vm_offset_t bsaddr; bsaddr = p->p_sysent->sv_usrstack - 2 * maxssiz; error = vm_map_find(map, 0, 0, &bsaddr, regstkpages * PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr; } #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; return (0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error; size_t length; /* * extract arguments first */ argv = imgp->userspace_argv; if (argv) { argp = (caddr_t)(intptr_t)fuword(argv); if (argp == (caddr_t)-1) return (EFAULT); if (argp) argv++; if (imgp->argv0) argp = imgp->argv0; if (argp) { do { if (argp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } while ((argp = (caddr_t)(intptr_t)fuword(argv++))); } } imgp->endargs = imgp->stringp; /* * extract environment strings */ envv = imgp->userspace_envv; if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. 
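exec_extract_strings() above walks the user argv/envv arrays with fuword()/copyinstr(), packing each NUL-terminated string into the temporary buffer and failing with E2BIG once the ARG_MAX budget is exhausted. A user-space approximation of the same accounting, with plain pointers and memcpy standing in for the user-memory accessors and a tiny budget so the overflow case is easy to trigger:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	#define BUDGET	32			/* tiny stand-in for ARG_MAX */

	static int
	pack_strings(char *const argv[], char *buf, size_t space)
	{
		char *p = buf;
		int i;

		for (i = 0; argv[i] != NULL; i++) {
			size_t len = strlen(argv[i]) + 1;	/* include the NUL */

			if (len > space)
				return (E2BIG);		/* budget exhausted */
			memcpy(p, argv[i], len);
			p += len;
			space -= len;
		}
		return (0);
	}

	int
	main(void)
	{
		char buf[BUDGET];
		char *ok[]  = { "ls", "-l", "/tmp", NULL };
		char *big[] = { "a-very-long-argument-string-that-will-not-fit", NULL };

		printf("ok:  %d\n", pack_strings(ok, buf, sizeof(buf)));
		printf("big: %d\n", pack_strings(big, buf, sizeof(buf)));
		return (0);
	}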
*/ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) * sizeof(char *)); /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->stringbase; argc = imgp->argc; envc = imgp->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. 
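exec_copyout_strings() above lays the new image's argument block out from the top of the stack down: the signal trampoline (if any), then the packed strings, then the argv/envv pointer vectors with their terminating NULLs, and the base of the vectors becomes the initial stack pointer. A self-contained user-space sketch of that layout inside an ordinary buffer; it handles only argv, with no environment, auxargs, or ps_strings bookkeeping.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define STACKSZ	256

	int
	main(void)
	{
		static char stack[STACKSZ];		/* pretend top-of-stack area */
		const char *argv[] = { "prog", "arg1", NULL };
		int argc = 2, i;
		char *destp = stack + STACKSZ;		/* strings are packed at the very top */
		char **vectp;

		for (i = 0; i < argc; i++)
			destp -= strlen(argv[i]) + 1;

		/*
		 * The pointer vector (argc entries plus a terminating NULL) sits
		 * just below the strings, rounded down to pointer alignment.
		 */
		vectp = (char **)((uintptr_t)(destp - (argc + 1) * sizeof(char *)) &
		    ~(uintptr_t)(sizeof(char *) - 1));

		for (i = 0; i < argc; i++) {
			strcpy(destp, argv[i]);
			vectp[i] = destp;
			destp += strlen(argv[i]) + 1;
		}
		vectp[argc] = NULL;

		for (i = 0; vectp[i] != NULL; i++)
			printf("argv[%d] @ offset %td: %s\n", i, vectp[i] - stack, vectp[i]);
		return (0);
	}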
*/ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 116360) +++ head/sys/kern/kern_exit.c (revision 116361) @@ -1,790 +1,790 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
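/*
 * Illustrative userland sketch, not part of this change: exec_register()
 * above grows a NULL-terminated array of handler pointers by allocating a
 * fresh array, copying the old entries, appending the new one, and freeing
 * the old array (exec_unregister() is the mirror image).  The same idiom
 * with invented names:
 */
#include <stdio.h>
#include <stdlib.h>

typedef int (*handler_t)(void);

static handler_t *handlers;            /* NULL-terminated vector, or NULL */

static int
handler_register(handler_t h)
{
    handler_t *ov = handlers, *nv, *p;
    size_t n = 2;                      /* new slot + trailing NULL */

    if (ov != NULL)
        for (p = ov; *p != NULL; p++)
            n++;
    if ((nv = malloc(n * sizeof(*nv))) == NULL)
        return (-1);
    p = nv;
    if (ov != NULL)
        for (handler_t *q = ov; *q != NULL; q++)
            *p++ = *q;
    *p++ = h;
    *p = NULL;
    free(ov);                          /* free(NULL) is a no-op */
    handlers = nv;
    return (0);
}

static int
dummy(void)
{
    return (42);
}

int
main(void)
{
    handler_register(dummy);
    printf("first handler returns %d\n", handlers[0]());
    return (0);
}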
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include /* Required to be non-static for SysVR4 emulator */ MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); static int wait1(struct thread *, struct wait_args *, int); /* * exit -- * Death of process. * * MPSAFE */ void sys_exit(struct thread *td, struct sys_exit_args *uap) { mtx_lock(&Giant); exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct tty *tp; struct vnode *ttyvp; struct vmspace *vm; struct vnode *vtmp; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif GIANT_REQUIRED; p = td->td_proc; if (p == initproc) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * First check if some other thread got here before us.. * if so, act apropriatly, (exit or suspend); */ thread_suspend_check(0); /* * Kill off the other threads. This requires * Some co-operation from other parts of the kernel * so it may not be instant. * With this state set: * Any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediatly * with EINTR or EWOULDBLOCK, which will hopefully force them * to back out to userland, freeing resources as they go, and * anything attempting to return to userland will thread_exit() * from userret(). thread_exit() will unsuspend us * when the last other thread exits. */ if (thread_single(SINGLE_EXIT)) { panic ("Exit: Single threading fouled up"); } /* * All other activity in this process is now stopped. * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) * ... * Turn off threading support. */ - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; thread_single_end(); /* Don't need this any more. */ } /* * With this state set: * Any thread entering the kernel from userspace will thread_exit() * in trap(). 
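/*
 * Illustrative userland sketch, not part of this change: the single-threading
 * performed above is the kernel side of the usual userland expectation that
 * exit(3) called from any one thread tears down the whole process, worker
 * threads included.  A minimal pthread demonstration:
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *
spinner(void *arg)
{
    (void)arg;
    for (;;) {
        printf("worker still alive\n");
        sleep(1);
    }
    return (NULL);
}

int
main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, spinner, NULL);
    sleep(2);
    /*
     * exit() from the initial thread: the kernel forces every other
     * thread out before the process becomes a zombie, so the worker
     * never outlives this call.
     */
    exit(0);
}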
Any thread attempting to sleep will return immediatly * with EINTR or EWOULDBLOCK, which will hopefully force them * to back out to userland, freeing resources as they go, and * anything attempting to return to userland will thread_exit() * from userret(). thread_exit() will do a wakeup on p->p_numthreads * if it transitions to 1. */ p->p_flag |= P_WEXIT; PROC_UNLOCK(p); /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } #ifdef PGINPROF vmsizmon(); #endif STOPEVENT(p, S_EXIT, rv); wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ /* * Check if any loadable modules need anything done at process exit. * e.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); SIGEMPTYSET(p->p_siglist); SIGEMPTYSET(td->td_siglist); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * Close open files and release open-file table. * This may block! */ fdfree(td); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. * * Processes sharing the same vmspace may exit in one order, and * get cleaned up by vmspace_exit() in a different order. The * last exiting process to reach this point releases as much of * the environment as it can, and the last process cleaned up * by vmspace_exit() (which decrements exitingcnt) cleans up the * remainder. */ ++vm->vm_exitingcnt; if (--vm->vm_refcnt == 0) { shmexit(vm); vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); vm_page_unlock_queues(); (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); } sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp; sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. 
*/ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { tp = sp->s_ttyp; if (sp->s_ttyp->t_pgrp) { PGRP_LOCK(sp->s_ttyp->t_pgrp); pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); PGRP_UNLOCK(sp->s_ttyp->t_pgrp); } /* XXX tp should be locked. */ sx_xunlock(&proctree_lock); (void) ttywait(tp); sx_xlock(&proctree_lock); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); sx_xunlock(&proctree_lock); VOP_REVOKE(ttyvp, REVOKEALL); vrele(ttyvp); sx_xlock(&proctree_lock); } } if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); vrele(ttyvp); } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } SESS_LOCK(p->p_session); sp->s_leader = NULL; SESS_UNLOCK(p->p_session); } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); #ifdef KTRACE /* * release trace file */ PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; vrele(vtmp); } /* * Release our limits structure. */ mtx_assert(&Giant, MA_OWNED); if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; } /* * Release this thread's reference to the ucred. The actual proc * reference will stay around until the proc is harvested by * wait(). At this point the ucred is immutable (no other threads * from this proc are around that can change it) so we leave the * per-thread ucred pointer intact in case it is needed although * in theory nothing should be using it at this point. */ crfree(td->td_ucred); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ PROC_LOCK(p); p->p_xstat = rv; *p->p_ru = p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); mtx_unlock_spin(&sched_lock); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify interested parties of our demise. */ KNOTE(&p->p_klist, NOTE_EXIT); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). 
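/*
 * Illustrative userland sketch, not part of this change: proc_reparent(),
 * used above to hand orphaned children to init, is a straightforward
 * <sys/queue.h> manipulation: unlink the child from its current parent's
 * sibling list, link it onto the new parent's list, and repoint the parent
 * pointer.  The same shape with toy structures:
 */
#include <sys/queue.h>
#include <stdio.h>

struct toyproc {
    const char            *name;
    struct toyproc        *parent;
    LIST_HEAD(, toyproc)  children;
    LIST_ENTRY(toyproc)   sibling;
};

static void
toy_reparent(struct toyproc *child, struct toyproc *parent)
{
    if (child->parent == parent)
        return;
    LIST_REMOVE(child, sibling);
    LIST_INSERT_HEAD(&parent->children, child, sibling);
    child->parent = parent;
}

int
main(void)
{
    struct toyproc init = { .name = "init" };
    struct toyproc shell = { .name = "sh" };
    struct toyproc job = { .name = "job" };

    LIST_INIT(&init.children);
    LIST_INIT(&shell.children);
    job.parent = &shell;
    LIST_INSERT_HEAD(&shell.children, &job, sibling);

    toy_reparent(&job, &init);         /* orphaned: goes to init */
    printf("%s's parent is now %s\n", job.name, job.parent->name);
    return (0);
}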
*/ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); PROC_LOCK(p->p_pptr); /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. */ if (LIST_EMPTY(&pp->p_children)) wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_sigparent && p->p_pptr != initproc) psignal(p->p_pptr, p->p_sigparent); else psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); /* * If this is a kthread, then wakeup anyone waiting for it to exit. */ if (p->p_flag & P_KTHREAD) wakeup(p); PROC_UNLOCK(p); /* * Finally, call machine-dependent code to release the remaining * resources including address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); PROC_LOCK(p); PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); mtx_lock_spin(&sched_lock); while (mtx_owned(&Giant)) mtx_unlock(&Giant); /* * We have to wait until after acquiring all locks before * changing p_state. If we block on a mutex then we will be * back at SRUN when we resume and our parent will never * harvest us. */ p->p_state = PRS_ZOMBIE; wakeup(p->p_pptr); PROC_UNLOCK(p->p_pptr); cnt.v_swtch++; binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */ /* * Allow the scheduler to adjust the priority of the * parent when a kseg is exiting. */ if (p->p_pid != 1) sched_exit(p->p_pptr, p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifdef COMPAT_43 /* * MPSAFE. The dirty work is handled by wait1(). */ int owait(struct thread *td, struct owait_args *uap __unused) { struct wait_args w; w.options = 0; w.rusage = NULL; w.pid = WAIT_ANY; w.status = NULL; return (wait1(td, &w, 1)); } #endif /* COMPAT_43 */ /* * MPSAFE. The dirty work is handled by wait1(). */ int wait4(struct thread *td, struct wait_args *uap) { return (wait1(td, uap, 0)); } /* * MPSAFE */ static int wait1(struct thread *td, struct wait_args *uap, int compat) { struct rusage ru; int nfound; struct proc *p, *q, *t; int status, error; q = td->td_proc; if (uap->pid == 0) { PROC_LOCK(q); uap->pid = -q->p_pgid; PROC_UNLOCK(q); } if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) return (EINVAL); mtx_lock(&Giant); loop: nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (uap->pid != WAIT_ANY && p->p_pid != uap->pid && p->p_pgid != -uap->pid) { PROC_UNLOCK(p); continue; } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. 
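/*
 * Illustrative userland sketch, not part of this change: the zombie
 * harvesting in wait1() above is what wait4(2)/waitpid(2) callers observe.
 * A minimal consumer that forks a child and decodes the status word the
 * kernel stored in p_xstat:
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    pid_t pid;
    int status;

    if ((pid = fork()) == 0)
        _exit(7);                      /* child */
    if (waitpid(pid, &status, 0) != pid) {
        perror("waitpid");
        return (1);
    }
    if (WIFEXITED(status))
        printf("child %d exited with %d\n", (int)pid, WEXITSTATUS(status));
    else if (WIFSIGNALED(status))
        printf("child %d killed by signal %d\n", (int)pid, WTERMSIG(status));
    return (0);
}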
*/ if ((p->p_sigparent != SIGCHLD) ^ ((uap->options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); continue; } nfound++; if (p->p_state == PRS_ZOMBIE) { td->td_retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) td->td_retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ PROC_UNLOCK(p); if ((error = copyout(&status, uap->status, sizeof(status)))) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (error); } PROC_LOCK(p); } if (uap->rusage) { bcopy(p->p_ru, &ru, sizeof(ru)); PROC_UNLOCK(p); if ((error = copyout(&ru, uap->rusage, sizeof (struct rusage)))) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (error); } } else PROC_UNLOCK(p); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); p->p_oppid = 0; proc_reparent(p, t); PROC_UNLOCK(p); psignal(t, SIGCHLD); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (0); } /* * Remove other references to this process to ensure * we have an exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that * all other writes to this proc are visible now, so * no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, p->p_ru); PROC_UNLOCK(q); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Free credentials, arguments, and sigacts */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * do any thread-system specific cleanups */ thread_wait(p); /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. 
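/*
 * Illustrative userland sketch, not part of this change: the rusage copyout
 * above is what makes the fourth argument of wait4(2) work; the parent gets
 * the times the child accumulated in p_ru.  On FreeBSD:
 */
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct rusage ru;
    pid_t pid;
    int status;
    volatile unsigned long n = 0;

    if ((pid = fork()) == 0) {
        while (n < 50000000UL)         /* burn a little CPU */
            n++;
        _exit(0);
    }
    if (wait4(pid, &status, 0, &ru) != pid)
        return (1);
    printf("child user time: %ld.%06lds\n",
        (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
    return (0);
}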
*/ vm_waitproc(p); #ifdef MAC mac_destroy_proc(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("wait1: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); mtx_unlock(&Giant); return (0); } mtx_lock_spin(&sched_lock); if (P_SHOULDSTOP(p) && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0) && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { mtx_unlock_spin(&sched_lock); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) { td->td_retval[1] = W_STOPCODE(p->p_xstat); PROC_UNLOCK(p); error = 0; } else #endif if (uap->status) { status = W_STOPCODE(p->p_xstat); PROC_UNLOCK(p); error = copyout(&status, uap->status, sizeof(status)); } else { PROC_UNLOCK(p); error = 0; } mtx_unlock(&Giant); return (error); } mtx_unlock_spin(&sched_lock); if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; p->p_flag &= ~P_CONTINUED; PROC_UNLOCK(p); if (uap->status) { status = SIGCONT; error = copyout(&status, uap->status, sizeof(status)); } else error = 0; mtx_unlock(&Giant); return (error); } PROC_UNLOCK(p); } if (nfound == 0) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (ECHILD); } if (uap->options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; mtx_unlock(&Giant); return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) { mtx_unlock(&Giant); return (error); } goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 116360) +++ head/sys/kern/kern_fork.c (revision 116361) @@ -1,829 +1,829 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif static int forksleep; /* Place for fork1() to sleep on. */ /* * MPSAFE */ /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return error; } /* * MPSAFE */ /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return error; } /* * MPSAFE */ int rfork(td, uap) struct thread *td; struct rfork_args *uap; { int error; struct proc *p2; /* Don't allow kernel only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return error; } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
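/*
 * Illustrative sketch, not part of this change: the kern.randompid knob
 * described above just adds a bounded random offset to the next PID
 * candidate.  In isolation the computation is one line; arc4random(3) is
 * available in FreeBSD's libc:
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    int lastpid = 700;
    int randompid = 1000;              /* sysctl kern.randompid modulus */
    int trypid;

    trypid = lastpid + 1;
    if (randompid != 0)
        trypid += arc4random() % randompid;
    printf("next pid candidate: %d\n", trypid);
    return (0);
}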
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; sysctl_wire_old_buffer(req, sizeof(int)); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; /* parent proc */ int flags; int pages; struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; int trypid; int ok; static int pidchecked = 0; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct proc *p1 = td->td_proc; struct thread *td2; struct kse *ke2; struct ksegrp *kg2; struct sigacts *newsigacts; int error; /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); mtx_lock(&Giant); /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { vm_forkproc(td, NULL, NULL, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent.) */ if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); if (p1->p_fd->fd_refcnt > 1) { struct filedesc *newfd; newfd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdfree(td); p1->p_fd = newfd; } else FILEDESC_UNLOCK(p1->p_fd); } mtx_unlock(&Giant); *procp = NULL; return (0); } /* * Note 1:1 allows for forking with one thread coming out on the * other side with the expectation that the process is about to * exec. */ - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { /* * Idle the other threads for a second. * Since the user space is copied, it must remain stable. * In addition, all threads (from the user perspective) * need to either be suspended or in the kernel, * where they will try restart in the parent and will * be aborted in the child. */ PROC_LOCK(p1); if (thread_single(SINGLE_NO_EXIT)) { /* Abort.. someone else is single threading before us */ PROC_UNLOCK(p1); mtx_unlock(&Giant); return (ERESTART); } PROC_UNLOCK(p1); /* * All other activity in this process * is now suspended at the user boundary, * (or other safe places if we think of any). */ } /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); #ifdef MAC mac_init_proc(newproc); #endif /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); uid = td->td_ucred->cr_ruid; if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); PROC_UNLOCK(p1); if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. 
There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) { trypid = 10; } } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { PROC_LOCK(p2); while (p2->p_pid == trypid || p2->p_pgrp->pg_id == trypid || p2->p_session->s_sid == trypid) { trypid++; if (trypid >= pidchecked) { PROC_UNLOCK(p2); goto retry; } } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; PROC_UNLOCK(p2); } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(&allproc_lock); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(td->td_proc->p_fd); fdtol = NULL; } else if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_LOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_UNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. 
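/*
 * Illustrative sketch, not part of this change: stripped of the allproc and
 * zombproc walking, the PID allocation loop above is "advance trypid past
 * every id in use, wrapping around PID_MAX and skipping the low range
 * reserved for long-lived daemons".  A simplified version over a plain
 * array of ids in use ("used" and "next_pid" are invented names):
 */
#include <stdio.h>

#define TOY_PID_MAX    99999

static int
in_use(const int *used, int nused, int pid)
{
    for (int i = 0; i < nused; i++)
        if (used[i] == pid)
            return (1);
    return (0);
}

static int
next_pid(const int *used, int nused, int trypid)
{
    for (;;) {
        if (trypid >= TOY_PID_MAX) {
            trypid %= TOY_PID_MAX;
            if (trypid < 100)          /* low pids belong to daemons */
                trypid += 100;
        }
        if (!in_use(used, nused, trypid))
            return (trypid);
        trypid++;
    }
}

int
main(void)
{
    int used[] = { 701, 702, 704 };

    printf("%d\n", next_pid(used, 3, 701));          /* prints 703 */
    printf("%d\n", next_pid(used, 3, TOY_PID_MAX));  /* wraps to 100 */
    return (0);
}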
*/ td2 = FIRST_THREAD_IN_PROC(p2); kg2 = FIRST_KSEGRP_IN_PROC(p2); ke2 = FIRST_KSE_IN_KSEGRP(kg2); /* Allocate and switch to an alternate kstack if specified */ if (pages != 0) vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) bzero(&p2->p_startzero, (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); bzero(&ke2->ke_startzero, (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); bzero(&td2->td_startzero, (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); bzero(&kg2->kg_startzero, (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&p1->p_startcopy, &p2->p_startcopy, (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); #undef RANGEOF /* Set up the thread as an active thread (as if runnable). */ ke2->ke_state = KES_THREAD; ke2->ke_thread = td2; td2->td_kse = ke2; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats substruct is set in vm_forkproc. */ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; /* * Allow the scheduler to adjust the priority of the child and * parent while we hold the sched_lock. */ sched_fork(p1, p2); mtx_unlock_spin(&sched_lock); p2->p_ucred = crhold(td->td_ucred); td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; /* Bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; if (p2->p_textvp) VREF(p2->p_textvp); p2->p_fd = fd; p2->p_fdtol = fdtol; PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * p_limit is copy-on-write, bump refcnt, */ p2->p_limit = p1->p_limit; p2->p_limit->p_refcnt++; /* * Setup linkage for kernel based threading */ if((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. 
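/*
 * Illustrative userland sketch, not part of this change: the RANGEOF()
 * trick used above zeroes or copies everything between two marker members
 * of a structure in one call, so a new field inherits the right treatment
 * just by being declared in the right region.  A userland rendition with a
 * toy structure:
 */
#include <stddef.h>
#include <stdio.h>
#include <strings.h>

#define RANGEOF(type, start, end) \
    (offsetof(type, end) - offsetof(type, start))

struct toyproc {
    /* start of zeroed-on-fork region */
    int t_startzero;
    int t_exitstatus;
    int t_pendingsigs;
    int t_endzero;
    /* start of copied-from-parent region */
    int t_startcopy;
    int t_nice;
    int t_flags;
    int t_endcopy;
};

int
main(void)
{
    struct toyproc parent = { .t_exitstatus = 1, .t_nice = 10, .t_flags = 0x4 };
    struct toyproc child;

    bzero((char *)&child + offsetof(struct toyproc, t_startzero),
        RANGEOF(struct toyproc, t_startzero, t_endzero));
    bcopy((char *)&parent + offsetof(struct toyproc, t_startcopy),
        (char *)&child + offsetof(struct toyproc, t_startcopy),
        RANGEOF(struct toyproc, t_startcopy, t_endcopy));

    printf("child: exitstatus=%d nice=%d flags=%#x\n",
        child.t_exitstatus, child.t_nice, child.t_flags);
    return (0);
}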
*/ p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, 1); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. */ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, flags); if (flags == (RFFDG | RFPROC)) { cnt.v_forks++; cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { cnt.v_vforks++; cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (p1 == &proc0) { cnt.v_kthreads++; cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else { cnt.v_rforks++; cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ microuptime(&p2->p_stats->p_start); if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); p2->p_state = PRS_NORMAL; TD_SET_CAN_RUN(td2); setrunqueue(td2); mtx_unlock_spin(&sched_lock); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * tell any interested parties about the new process */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * If other threads are waiting, let them continue now */ - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } /* * Return child proc pointer to parent. 
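/*
 * Illustrative userland sketch, not part of this change: the P_PPWAIT sleep
 * loop above is the kernel side of vfork(2)'s contract; the parent does not
 * run again until the child has either exec'd or exited.  Observable from
 * userland (the child must restrict itself to _exit()/execve()):
 */
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    pid_t pid;

    printf("parent: about to vfork\n");
    fflush(stdout);
    if ((pid = vfork()) == 0) {
        /* Child borrows the parent's address space: keep it simple. */
        write(STDOUT_FILENO, "child: running\n", 15);
        _exit(0);
    }
    /* This line cannot print before the child's message. */
    printf("parent: resumed after child %d exited\n", (int)pid);
    return (0);
}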
*/ mtx_unlock(&Giant); *procp = p2; return (0); fail: sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } tsleep(&forksleep, PUSER, "fork", hz / 2); mtx_unlock(&Giant); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct thread *td; struct proc *p; if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } td = curthread; p = td->td_proc; td->td_oncpu = PCPU_GET(cpuid); p->p_state = PRS_NORMAL; /* * Finish setting up thread glue. We need to initialize * the thread into a td_critnest=1 state. Some platforms * may have already partially or fully initialized td_critnest * and/or td_md.md_savecrit (when applciable). * * see //critical.c */ sched_lock.mtx_lock = (uintptr_t)td; sched_lock.mtx_recurse = 0; cpu_critical_fork_exit(); CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); mtx_unlock_spin(&sched_lock); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ PROC_LOCK(p); if (p->p_flag & P_KTHREAD) { PROC_UNLOCK(p); mtx_lock(&Giant); printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kthread_exit(0); } PROC_UNLOCK(p); #ifdef DIAGNOSTIC cred_free_thread(td); #endif mtx_assert(&Giant, MA_NOTOWNED); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_kse.c =================================================================== --- head/sys/kern/kern_kse.c (revision 116360) +++ head/sys/kern/kern_kse.c (revision 116361) @@ -1,2022 +1,2022 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. */ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; static uma_zone_t upcall_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int thread_debug = 0; SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW, &thread_debug, 0, "thread debug"); static int max_threads_per_proc = 150; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_groups_per_proc = 50; SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW, &max_groups_per_proc, 0, "Limit on thread groups per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); static int virtual_cpu; #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); TAILQ_HEAD(, kse_upcall) zombie_upcalls = TAILQ_HEAD_INITIALIZER(zombie_upcalls); struct mtx kse_zombie_lock; MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); static void kse_purge(struct proc *p, struct thread *td); static void kse_purge_group(struct thread *td); static int thread_update_usr_ticks(struct thread *td, int user); static void thread_alloc_spare(struct thread *td, struct thread *spare); static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; #ifdef SMP def_val = mp_ncpus; #else def_val = 1; #endif if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); /* * Prepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; } /* * Reclaim a thread after use. 
*/ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; td = (struct thread *)mem; mtx_lock(&Giant); vm_thread_new(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); td->td_sched = (struct td_sched *)&td[1]; } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; vm_thread_dispose(td); } /* * Initialize type-stable parts of a kse (when newly created). */ static void kse_init(void *mem, int size) { struct kse *ke; ke = (struct kse *)mem; ke->ke_sched = (struct ke_sched *)&ke[1]; } /* * Initialize type-stable parts of a ksegrp (when newly created). */ static void ksegrp_init(void *mem, int size) { struct ksegrp *kg; kg = (struct ksegrp *)mem; kg->kg_sched = (struct kg_sched *)&kg[1]; } /* * KSE is linked into kse group. */ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; ke->ke_flags = 0; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (ke->ke_state == KES_IDLE) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; } if (--kg->kg_kses == 0) ksegrp_unlink(kg); /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */ TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */ kg->kg_proc = p; /* * the following counters are in the -zero- section * and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ kg->kg_idle_kses = 0; kg->kg_numupcalls = 0; /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads")); KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses")); KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls")); p = kg->kg_proc; TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } struct kse_upcall * upcall_alloc(void) { struct kse_upcall *ku; ku = uma_zalloc(upcall_zone, M_WAITOK); bzero(ku, sizeof(*ku)); return (ku); } void upcall_free(struct kse_upcall *ku) { uma_zfree(upcall_zone, ku); } void upcall_link(struct kse_upcall *ku, struct ksegrp *kg) { mtx_assert(&sched_lock, MA_OWNED); 
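/*
 * Illustrative userland sketch, not part of this change: kse_link(),
 * ksegrp_link() and friends above are TAILQ insertions paired with counter
 * updates so a group always knows how many members it owns, and the unlink
 * side tears the group down when the count reaches zero.  The same pattern
 * with toy types:
 */
#include <sys/queue.h>
#include <stdio.h>

struct worker {
    int                 id;
    TAILQ_ENTRY(worker) link;
};

struct group {
    TAILQ_HEAD(, worker) workers;
    int                  nworkers;
};

static void
worker_link(struct group *g, struct worker *w)
{
    TAILQ_INSERT_HEAD(&g->workers, w, link);
    g->nworkers++;
}

static void
worker_unlink(struct group *g, struct worker *w)
{
    TAILQ_REMOVE(&g->workers, w, link);
    if (--g->nworkers == 0) {
        /* last member gone: tear the group down, as ksegrp_unlink() does */
    }
}

int
main(void)
{
    struct group g;
    struct worker a = { .id = 1 }, b = { .id = 2 };
    struct worker *w;

    TAILQ_INIT(&g.workers);
    g.nworkers = 0;
    worker_link(&g, &a);
    worker_link(&g, &b);
    worker_unlink(&g, &a);
    TAILQ_FOREACH(w, &g.workers, link)
        printf("worker %d still linked (%d total)\n", w->id, g.nworkers);
    return (0);
}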
TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link); ku->ku_ksegrp = kg; kg->kg_numupcalls++; } void upcall_unlink(struct kse_upcall *ku) { struct ksegrp *kg = ku->ku_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link); kg->kg_numupcalls--; upcall_stash(ku); } void upcall_remove(struct thread *td) { if (td->td_upcall) { td->td_upcall->ku_owner = NULL; upcall_unlink(td->td_upcall); td->td_upcall = 0; } } /* * For a newly created process, * link up all the structures and its initial threads etc. */ void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td) { TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_suspended); /* Threads suspended */ p->p_numksegrps = 0; p->p_numthreads = 0; ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); } /* struct kse_thr_interrupt_args { struct kse_thr_mailbox * tmbx; }; */ int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { struct proc *p; struct thread *td2; p = td->td_proc; - if (!(p->p_flag & P_THREADED) || (uap->tmbx == NULL)) + if (!(p->p_flag & P_SA) || (uap->tmbx == NULL)) return (EINVAL); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) { td2->td_flags |= TDF_INTERRUPT; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } mtx_unlock_spin(&sched_lock); return (0); } } mtx_unlock_spin(&sched_lock); return (ESRCH); } /* struct kse_exit_args { register_t dummy; }; */ int kse_exit(struct thread *td, struct kse_exit_args *uap) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct kse_upcall *ku, *ku2; int error, count; p = td->td_proc; if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) return (EINVAL); kg = td->td_ksegrp; count = 0; PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_UPCALL_IN_GROUP(kg, ku2) { if (ku2->ku_flags & KUF_EXITING) count++; } if ((kg->kg_numupcalls - count) == 1 && (kg->kg_numthreads > 1)) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); error = suword(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE); PROC_LOCK(p); if (error) psignal(p, SIGSEGV); mtx_lock_spin(&sched_lock); upcall_remove(td); ke = td->td_kse; if (p->p_numthreads == 1) { kse_purge(p, td); - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } else { if (kg->kg_numthreads == 1) { /* Shutdown a group */ kse_purge_group(td); ke->ke_flags |= KEF_EXIT; } thread_stopped(p); thread_exit(); /* NOTREACHED */ } return (0); } /* * Either becomes an upcall or waits for an awakening event and * then becomes an upcall. Only error cases return. */ /* struct kse_release_args { struct timespec *timeout; }; */ int kse_release(struct thread *td, struct kse_release_args *uap) { struct proc *p; struct ksegrp *kg; struct timespec ts, ts2, ts3, timeout; struct timeval tv; int error; p = td->td_proc; kg = td->td_ksegrp; if (td->td_upcall == NULL || TD_CAN_UNBIND(td)) return (EINVAL); if (uap->timeout != NULL) { if ((error = copyin(uap->timeout, &timeout, sizeof(timeout)))) return (error); getnanouptime(&ts); timespecadd(&ts, &timeout); TIMESPEC_TO_TIMEVAL(&tv, &timeout); } mtx_lock_spin(&sched_lock); /* Change OURSELF to become an upcall. 
*/ td->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (p->p_sflag & PS_NEEDSIGCHK) td->td_flags |= TDF_ASTPENDING; #endif mtx_unlock_spin(&sched_lock); PROC_LOCK(p); while ((td->td_upcall->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_completed == NULL)) { kg->kg_upsleeps++; error = msleep(&kg->kg_completed, &p->p_mtx, PPAUSE|PCATCH, "kse_rel", (uap->timeout ? tvtohz(&tv) : 0)); kg->kg_upsleeps--; PROC_UNLOCK(p); if (uap->timeout == NULL || error != EWOULDBLOCK) return (0); getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); PROC_LOCK(p); } PROC_UNLOCK(p); return (0); } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { struct proc *p; struct ksegrp *kg; struct kse_upcall *ku; struct thread *td2; p = td->td_proc; td2 = NULL; ku = NULL; /* KSE-enabled processes only, please. */ - if (!(p->p_flag & P_THREADED)) + if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (uap->mbx) { FOREACH_KSEGRP_IN_PROC(p, kg) { FOREACH_UPCALL_IN_GROUP(kg, ku) { if (ku->ku_mailbox == uap->mbx) break; } if (ku) break; } } else { kg = td->td_ksegrp; if (kg->kg_upsleeps) { wakeup_one(&kg->kg_completed); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } ku = TAILQ_FIRST(&kg->kg_upcalls); } if (ku) { if ((td2 = ku->ku_owner) == NULL) { panic("%s: no owner", __func__); } else if (TD_ON_SLEEPQ(td2) && (td2->td_wchan == &kg->kg_completed)) { abortsleep(td2); } else { ku->ku_flags |= KUF_DOUPCALL; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (ESRCH); } /* * No new KSEG: first call: use current KSE, don't schedule an upcall * All other situations, do allocate max new KSEs and schedule an upcall. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { struct kse *newke; struct ksegrp *newkg; struct ksegrp *kg; struct proc *p; struct kse_mailbox mbx; struct kse_upcall *newku; int err, ncpus; p = td->td_proc; if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); /* Too bad, why hasn't kernel always a cpu counter !? */ #ifdef SMP ncpus = mp_ncpus; #else ncpus = 1; #endif if (thread_debug && virtual_cpu != 0) ncpus = virtual_cpu; /* Easier to just set it than to test and set */ PROC_LOCK(p); - p->p_flag |= P_THREADED; + p->p_flag |= P_SA; PROC_UNLOCK(p); kg = td->td_ksegrp; if (uap->newgroup) { /* Have race condition but it is cheap */ if (p->p_numksegrps >= max_groups_per_proc) return (EPROCLIM); /* * If we want a new KSEGRP it doesn't matter whether * we have already fired up KSE mode before or not. * We put the process in KSE mode and create a new KSEGRP. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); mtx_lock_spin(&sched_lock); if (p->p_numksegrps >= max_groups_per_proc) { mtx_unlock_spin(&sched_lock); ksegrp_free(newkg); return (EPROCLIM); } ksegrp_link(newkg, p); mtx_unlock_spin(&sched_lock); } else { newkg = kg; } /* * Creating upcalls more than number of physical cpu does * not help performance. */ if (newkg->kg_numupcalls >= ncpus) return (EPROCLIM); if (newkg->kg_numupcalls == 0) { /* * Initialize KSE group, optimized for MP. 
* Create KSEs as many as physical cpus, this increases * concurrent even if userland is not MP safe and can only run * on single CPU (for early version of libpthread, it is true). * In ideal world, every physical cpu should execute a thread. * If there is enough KSEs, threads in kernel can be * executed parallel on different cpus with full speed, * Concurrent in kernel shouldn't be restricted by number of * upcalls userland provides. * Adding more upcall structures only increases concurrent * in userland. * Highest performance configuration is: * N kses = N upcalls = N phyiscal cpus */ while (newkg->kg_kses < ncpus) { newke = kse_alloc(); bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 mtx_lock_spin(&sched_lock); bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); mtx_unlock_spin(&sched_lock); #endif mtx_lock_spin(&sched_lock); kse_link(newke, newkg); /* Add engine */ kse_reassign(newke); mtx_unlock_spin(&sched_lock); } } newku = upcall_alloc(); newku->ku_mailbox = uap->mbx; newku->ku_func = mbx.km_func; bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t)); /* For the first call this may not have been set */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (newkg->kg_numupcalls >= ncpus) { mtx_unlock_spin(&sched_lock); upcall_free(newku); return (EPROCLIM); } upcall_link(newku, newkg); if (mbx.km_quantum) newkg->kg_upquantum = max(1, mbx.km_quantum/tick); /* * Each upcall structure has an owner thread, find which * one owns it. */ if (uap->newgroup) { /* * Because new ksegrp hasn't thread, * create an initial upcall thread to own it. */ thread_schedule_upcall(td, newku); } else { /* * If current thread hasn't an upcall structure, * just assign the upcall to it. */ if (td->td_upcall == NULL) { newku->ku_owner = td; td->td_upcall = newku; } else { /* * Create a new upcall thread to own it. */ thread_schedule_upcall(td, newku); } } mtx_unlock_spin(&sched_lock); return (0); } /* * Initialize global thread allocation resources. */ void threadinit(void) { thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(), NULL, NULL, ksegrp_init, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sched_sizeof_kse(), NULL, NULL, kse_init, NULL, UMA_ALIGN_CACHE, 0); upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. */ void kse_stash(struct kse *ke) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra upcall into the zombie upcall queue. */ void upcall_stash(struct kse_upcall *ku) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&kse_zombie_lock); } /* * Reap zombie kse resource. 
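/*
 * Illustrative userland sketch, not part of this change: the stash functions
 * above and the thread_reap() that follows defer freeing: a context that
 * cannot safely free an object right now pushes it onto a locked zombie
 * list, and a later, safer context drains the whole list at once, freeing
 * outside the lock.  A userland rendition with a pthread mutex:
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct zombie {
    struct zombie *next;
    int            id;
};

static struct zombie *zombies;                 /* stash list head */
static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

static void
zombie_stash(struct zombie *z)
{
    pthread_mutex_lock(&zombie_lock);
    z->next = zombies;
    zombies = z;
    pthread_mutex_unlock(&zombie_lock);
}

static void
zombie_reap(void)
{
    struct zombie *z, *next;

    /* Grab the whole list in one short critical section. */
    pthread_mutex_lock(&zombie_lock);
    z = zombies;
    zombies = NULL;
    pthread_mutex_unlock(&zombie_lock);

    for (; z != NULL; z = next) {              /* free outside the lock */
        next = z->next;
        printf("reaping zombie %d\n", z->id);
        free(z);
    }
}

int
main(void)
{
    for (int i = 0; i < 3; i++) {
        struct zombie *z = malloc(sizeof(*z));
        z->id = i;
        zombie_stash(z);
    }
    zombie_reap();
    return (0);
}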
*/ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; struct kse_upcall *ku_first, *ku_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps)) || (!TAILQ_EMPTY(&zombie_upcalls))) { mtx_lock_spin(&kse_zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); ku_first = TAILQ_FIRST(&zombie_upcalls); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); if (ku_first) TAILQ_INIT(&zombie_upcalls); mtx_unlock_spin(&kse_zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); ku_first = ku_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error,temp; mcontext_t mc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. */ get_mcontext(td, &mc, 0); addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext); error = copyout(&mc, addr, sizeof(mcontext_t)); if (error) goto bad; /* Exports clock ticks in kernel mode */ addr = (caddr_t)(&td->td_mailbox->tm_sticks); temp = fuword(addr) + td->td_usticks; if (suword(addr, temp)) { error = EFAULT; goto bad; } /* Get address in latest mbox of list pointer */ addr = (void *)(&td->td_mailbox->tm_next); /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { error = EFAULT; goto bad; } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; /* * The thread context may be taken away by * other upcall threads when we unlock * process lock. it's no longer valid to * use it again in any other places. 
*/ td->td_mailbox = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } td->td_usticks = 0; return (0); bad: PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); /* The mailbox is bad, don't use it */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * upcall's mailbox as it's the next one going up. */ static int thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; addr = (void *)(&ku->ku_mailbox->km_completed); for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * This function should be called at statclock interrupt time */ int thread_statclock(int user) { struct thread *td = curthread; if (td->td_ksegrp->kg_numupcalls == 0) return (-1); if (user) { /* Current always do via ast() */ mtx_lock_spin(&sched_lock); td->td_flags |= (TDF_USTATCLOCK|TDF_ASTPENDING); mtx_unlock_spin(&sched_lock); td->td_uuticks++; } else { if (td->td_mailbox != NULL) td->td_usticks++; else { /* XXXKSE * We will call thread_user_enter() for every * kernel entry in future, so if the thread mailbox * is NULL, it must be a UTS kernel, don't account * clock ticks for it. */ } } return (0); } /* * Export state clock ticks for userland */ static int thread_update_usr_ticks(struct thread *td, int user) { struct proc *p = td->td_proc; struct kse_thr_mailbox *tmbx; struct kse_upcall *ku; struct ksegrp *kg; caddr_t addr; uint uticks; if ((ku = td->td_upcall) == NULL) return (-1); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) return (-1); if (user) { uticks = td->td_uuticks; td->td_uuticks = 0; addr = (caddr_t)&tmbx->tm_uticks; } else { uticks = td->td_usticks; td->td_usticks = 0; addr = (caddr_t)&tmbx->tm_sticks; } if (uticks) { if (suword(addr, uticks+fuword(addr))) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (-2); } } kg = td->td_ksegrp; if (kg->kg_upquantum && ticks >= kg->kg_nextupcall) { mtx_lock_spin(&sched_lock); td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. 
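/*
 * Illustrative model, not from the patch: the "store, then re-check under
 * the lock" retry loop that thread_export_context() and thread_link_mboxes()
 * use above to prepend to a list whose head lives in the kernel while the
 * link field lives in user memory.  suword()/fuword() are replaced by plain
 * stores and the process lock by a pthread mutex, so only the control flow
 * carries over.
 */
#include <pthread.h>
#include <stddef.h>

struct mbox {
        struct mbox *next;
};

static struct mbox *completed_head;     /* plays the role of kg_completed */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
export_mailbox(struct mbox *mb)
{
        struct mbox *head;

        for (;;) {
                head = completed_head;          /* snapshot the current head */
                mb->next = head;                /* "suword" into the mailbox */
                pthread_mutex_lock(&list_lock);
                if (head == completed_head) {
                        /* Nobody raced us while we wrote: publish the head. */
                        completed_head = mb;
                        pthread_mutex_unlock(&list_lock);
                        return;
                }
                /* The head moved while we were storing; try again. */
                pthread_mutex_unlock(&list_lock);
        }
}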
*/ if (p->p_numthreads > 1) { thread_unlink(td); if (p->p_maxthrwaits) wakeup(&p->p_numthreads); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* * Because each upcall structure has an owner thread, * owner thread exits only when process is in exiting * state, so upcall to userland is no longer needed, * deleting upcall structure is safe here. * So when all threads in a group is exited, all upcalls * in the group should be automatically freed. */ if (td->td_upcall) upcall_remove(td); ke->ke_state = KES_UNQUEUED; ke->ke_thread = NULL; /* * Decide what to do with the KSE attached to this thread. */ if (ke->ke_flags & KEF_EXIT) kse_unlink(ke); else kse_reassign(ke); PROC_UNLOCK(p); td->td_kse = NULL; td->td_state = TDS_INACTIVE; #if 0 td->td_proc = NULL; #endif td->td_ksegrp = NULL; td->td_last_kse = NULL; PCPU_SET(deadthread, td); } else { PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ mtx_assert(&sched_lock, MA_OWNED); #if !defined(__alpha__) && !defined(__powerpc__) cpu_throw(td, choosethread()); #else cpu_throw(); #endif panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant held, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; KASSERT((p->p_numthreads == 1), ("Muliple threads in wait1()")); KASSERT((p->p_numksegrps == 1), ("Muliple ksegrps in wait1()")); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_standin != NULL) { thread_free(td->td_standin); td->td_standin = NULL; } cpu_thread_clean(td); } thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. */ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; td->td_flags = 0; td->td_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; } void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; struct ksegrp *kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* could clear a few other things here */ } /* * Purge a ksegrp resource. When a ksegrp is preparing to * exit, it calls this function. */ static void kse_purge_group(struct thread *td) { struct ksegrp *kg; struct kse *ke; kg = td->td_ksegrp; KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__)); while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); kse_unlink(ke); } KASSERT((kg->kg_kses == 1), ("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); } /* * Purge a process's KSE resource. When a process is preparing to * exit, it calls kse_purge to release any extra KSE resources in * the process. 
*/ static void kse_purge(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse *ke; KASSERT(p->p_numthreads == 1, ("bad thread number")); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * There is no ownership for KSE, after all threads * in the group exited, it is possible that some KSEs * were left in idle queue, gc them now. */ while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; kse_stash(ke); } KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("ksegrp has wrong kg_kses: %d", kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); if (kg != td->td_ksegrp) ksegrp_stash(kg); } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; } /* * This function is intended to be used to initialize a spare thread * for upcall. Initialize thread's large data area outside sched_lock * for thread_schedule_upcall(). */ void thread_alloc_spare(struct thread *td, struct thread *spare) { if (td->td_standin) return; if (spare == NULL) spare = thread_alloc(); td->td_standin = spare; bzero(&spare->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); spare->td_proc = td->td_proc; spare->td_ucred = crhold(td->td_ucred); } /* * Create a thread and schedule it for upcall on the KSE given. * Use our thread's standin so that we don't have to allocate one. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse_upcall *ku) { struct thread *td2; mtx_assert(&sched_lock, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, * the kse_upcall must be free. * td must have a spare thread. */ KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__)); if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { panic("no reserve thread when scheduling an upcall"); return (NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ku->ku_ksegrp); /* inherit blocked thread's context */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ ku->ku_owner = td2; td2->td_upcall = ku; td2->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (td->td_proc->p_sflag & PS_NEEDSIGCHK) td2->td_flags |= TDF_ASTPENDING; #endif td2->td_kse = NULL; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; setrunqueue(td2); return (td2); /* bogus.. should be a void function */ } void thread_signal_add(struct thread *td, int sig) { struct kse_upcall *ku; struct proc *p; sigset_t ss; int error; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&p->p_sigacts->ps_mtx, MA_OWNED); td = curthread; ku = td->td_upcall; mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); error = copyin(&ku->ku_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); if (error) goto error; SIGADDSET(ss, sig); error = copyout(&ss, &ku->ku_mailbox->km_sigscaught, sizeof(sigset_t)); if (error) goto error; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); return; error: PROC_LOCK(p); sigexit(td, SIGILL); } /* * Schedule an upcall to notify a KSE process recieved signals. 
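/*
 * Illustrative sketch, not from the patch: the standin/spare idea behind
 * thread_alloc_spare() and thread_schedule_upcall() above -- allocate the
 * spare object in a context that may sleep, so that handing it out later
 * under sched_lock is nothing more than a pointer swap.  All names here are
 * invented for the example.
 */
#include <stdlib.h>

struct spare_obj {
        int so_dummy;
};

static struct spare_obj *standin;       /* plays td->td_standin */

static void
prepare_spare(void)                     /* call where allocation may sleep */
{
        if (standin == NULL)
                standin = malloc(sizeof(*standin));
}

static struct spare_obj *
take_spare(void)                        /* call under a lock: must not allocate */
{
        struct spare_obj *so;

        so = standin;
        standin = NULL;                 /* owner must refill via prepare_spare() */
        return (so);
}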
* */ void thread_signal_upcall(struct thread *td) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); return; } void thread_switchout(struct thread *td) { struct kse_upcall *ku; mtx_assert(&sched_lock, MA_OWNED); /* * If the outgoing thread is in threaded group and has never * scheduled an upcall, decide whether this is a short * or long term event and thus whether or not to schedule * an upcall. * If it is a short term event, just suspend it in * a way that takes its KSE with it. * Select the events for which we want to schedule upcalls. * For now it's just sleep. * XXXKSE eventually almost any inhibition could do. */ if (TD_CAN_UNBIND(td) && (td->td_standin) && TD_ON_SLEEPQ(td)) { /* * Release ownership of upcall, and schedule an upcall * thread, this new upcall thread becomes the owner of * the upcall structure. */ ku = td->td_upcall; ku->ku_owner = NULL; td->td_upcall = NULL; td->td_flags &= ~TDF_CAN_UNBIND; thread_schedule_upcall(td, ku); } } /* * Setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; kg = td->td_ksegrp; /* * First check that we shouldn't just abort. * But check if we are the single thread first! */ PROC_LOCK(p); if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } PROC_UNLOCK(p); /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in kse_reassign()), * but for now do it every time. */ kg = td->td_ksegrp; if (kg->kg_numupcalls) { ku = td->td_upcall; KASSERT(ku, ("%s: no upcall owned", __func__)); KASSERT((ku->ku_owner == td), ("%s: wrong owner", __func__)); KASSERT(!TD_CAN_UNBIND(td), ("%s: can unbind", __func__)); ku->ku_mflags = fuword((void *)&ku->ku_mailbox->km_flags); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) { td->td_mailbox = NULL; } else { td->td_mailbox = tmbx; if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (ku->ku_mflags & KMF_NOUPCALL) td->td_flags &= ~TDF_CAN_UNBIND; else td->td_flags |= TDF_CAN_UNBIND; mtx_unlock_spin(&sched_lock); } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. */ int thread_userret(struct thread *td, struct trapframe *frame) { int error = 0, upcalls, uts_crit; struct kse_upcall *ku; struct ksegrp *kg, *kg2; struct proc *p; struct timespec ts; p = td->td_proc; kg = td->td_ksegrp; /* Nothing to do with non-threaded group/process */ if (td->td_ksegrp->kg_numupcalls == 0) return (0); /* * Stat clock interrupt hit in userland, it * is returning from interrupt, charge thread's * userland time for UTS. 
*/ if (td->td_flags & TDF_USTATCLOCK) { thread_update_usr_ticks(td, 1); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_USTATCLOCK; mtx_unlock_spin(&sched_lock); if (kg->kg_completed || (td->td_upcall->ku_flags & KUF_DOUPCALL)) thread_user_enter(p, td); } uts_crit = (td->td_mailbox == NULL); ku = td->td_upcall; /* * Optimisation: * This thread has not started any upcall. * If there is no work to report other than ourself, * then it can return direct to userland. */ if (TD_CAN_UNBIND(td)) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_CAN_UNBIND; if ((td->td_flags & TDF_NEEDSIGCHK) == 0 && (kg->kg_completed == NULL) && (ku->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_upquantum && ticks < kg->kg_nextupcall)) { mtx_unlock_spin(&sched_lock); thread_update_usr_ticks(td, 0); nanotime(&ts); error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday, sizeof(ts)); td->td_mailbox = 0; ku->ku_mflags = 0; if (error) goto out; return (0); } mtx_unlock_spin(&sched_lock); error = thread_export_context(td); if (error) { /* * Failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. */ goto out; } /* * There is something to report, and we own an upcall * strucuture, we can go to userland. * Turn ourself into an upcall thread. */ mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); } else if (td->td_mailbox && (ku == NULL)) { error = thread_export_context(td); /* possibly upcall with error? */ PROC_LOCK(p); /* * There are upcall threads waiting for * work to do, wake one of them up. * XXXKSE Maybe wake all of them up. */ if (!error && kg->kg_upsleeps) wakeup_one(&kg->kg_completed); mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind")); if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { upcalls = 0; FOREACH_KSEGRP_IN_PROC(p, kg2) { if (kg2->kg_numupcalls == 0) upcalls++; else upcalls += kg2->kg_numupcalls; } if (upcalls >= max_threads_per_proc) break; mtx_unlock_spin(&sched_lock); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", NULL)) { mtx_lock_spin(&sched_lock); break; } else { mtx_lock_spin(&sched_lock); } } p->p_maxthrwaits--; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } if (td->td_flags & TDF_UPCALLING) { uts_crit = 0; kg->kg_nextupcall = ticks+kg->kg_upquantum; /* * There is no more work to do and we are going to ride * this thread up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) ku->ku_flags &= ~KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); /* * Set user context to the UTS */ if (!(ku->ku_mflags & KMF_NOUPCALL)) { cpu_set_upcall_kse(td, ku); error = suword(&ku->ku_mailbox->km_curthread, 0); if (error) goto out; } /* * Unhook the list of completed threads. * anything that completes after this gets to * come in next time. * Put the list of completed thread mailboxes on * this KSE's mailbox. 
*/ if (!(ku->ku_mflags & KMF_NOCOMPLETED) && (error = thread_link_mboxes(kg, ku)) != 0) goto out; } if (!uts_crit) { nanotime(&ts); error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts)); } out: if (error) { /* * Things are going to be so screwed we should just kill * the process. * how do we do that? */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); PROC_UNLOCK(td->td_proc); } else { /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); } ku->ku_mflags = 0; /* * Clear thread mailbox first, then clear system tick count. * The order is important because thread_statclock() use * mailbox pointer to see if it is an userland thread or * an UTS kernel thread. */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); /* go sync */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accellerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int force_exit) { struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); - if ((p->p_flag & P_THREADED) == 0 && p->p_numthreads == 1) + if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1) return (0); /* Is someone already single threading? */ if (p->p_singlethread) return (1); if (force_exit == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; } else p->p_flag &= ~P_SINGLE_EXIT; p->p_flag |= P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = td; while ((p->p_numthreads - p->p_suspcount) != 1) { FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { if (force_exit == SINGLE_EXIT) { if (TD_IS_SUSPENDED(td2)) { thread_unsuspend_one(td2); } if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } } else { if (TD_IS_SUSPENDED(td2)) continue; /* * maybe other inhibitted states too? * XXXKSE Is it totally safe to * suspend a non-interruptable thread? */ if (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED)) thread_suspend_one(td2); } } } /* * Maybe we suspended some threads.. was it enough? */ if ((p->p_numthreads - p->p_suspcount) == 1) break; /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_one(td); DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock_spin(&sched_lock); } if (force_exit == SINGLE_EXIT) { if (td->td_upcall) upcall_remove(td); kse_purge(p, td); } mtx_unlock_spin(&sched_lock); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). 
* If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); mtx_lock_spin(&sched_lock); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { while (mtx_owned(&Giant)) mtx_unlock(&Giant); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_exit(); else thr_exit1(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. */ thread_suspend_one(td); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. 
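/*
 * Illustrative sketch, not from the patch: the decision table in the
 * thread_suspend_check() comment above, written out as a tiny function.
 * It assumes a single-threading stop is actually in effect (the
 * P_SHOULDSTOP() loop condition) and collapses "suspend" and "exit" into
 * return codes so the table can be read in isolation.
 */
enum sc_action {
        SC_CONTINUE,                    /* return 0 to the caller */
        SC_ABORT,                       /* return 1: caller backs out */
        SC_SUSPEND_THEN_CONTINUE,       /* park until single-threading ends */
        SC_EXIT                         /* thread_exit()/thr_exit1() */
};

static enum sc_action
suspend_check_model(int single_exit, int return_instead, int is_single_threader)
{
        if (is_single_threader)
                return (SC_CONTINUE);           /* the single threader never stops */
        if (return_instead)
                return (SC_ABORT);              /* both table rows: returns 1 immediately */
        if (single_exit)
                return (SC_EXIT);               /* process is exiting: this thread dies */
        return (SC_SUSPEND_THEN_CONTINUE);      /* resumes and returns 0 when ST ends */
}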
*/ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 116360) +++ head/sys/kern/kern_sig.c (revision 116361) @@ -1,2611 +1,2611 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined (__alpha__) && !defined(COMPAT_43) #error "You *really* need COMPAT_43 on the alpha for longjmp(3)" #endif #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void stop(struct proc *); static void tdsigwakeup(struct thread *td, int sig, sig_t action); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static int kern_sigtimedwait(struct thread *td, sigset_t set, siginfo_t *info, struct timespec *timeout); struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); /* * Signal properties and actions. 
* The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). * XXXKSE the check for a pending stop is not done under KSE * * MP SAFE. */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_siglist or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; sigset_t set; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signal that were * previously masked by all threads to our siglist. */ set = p->p_siglist; SIGSETNAND(set, td->td_sigmask); SIGSETNAND(p->p_siglist, set); SIGSETOR(td->td_siglist, set); if (SIGPENDING(td)) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; mtx_unlock_spin(&sched_lock); } } int sigonstack(size_t sp) { struct proc *p = curthread->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return ((p->p_flag & P_ALTSTACK) ? #if defined(COMPAT_43) || defined(COMPAT_SUNOS) ((p->p_sigstk.ss_size == 0) ? 
(p->p_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size)) #else ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction * * MPSAFE */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct thread *td0; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) oact->sa_flags |= SA_SIGINFO; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); #ifdef COMPAT_SUNOS if (act->sa_flags & SA_USERTRAMP) SIGADDSET(ps->ps_usertramp, sig); else SIGDELSET(ps->ps_usertramp, sig); #endif if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. 
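/*
 * Illustrative userland example, not from the patch: a sigaction(2) call
 * exercising the flag bits kern_sigaction() decodes above.  SA_SIGINFO puts
 * the signal in ps_siginfo so the three-argument handler form is used, and
 * leaving SA_RESTART out lands it in ps_sigintr, so interrupted syscalls
 * fail with EINTR instead of restarting.
 */
#include <signal.h>
#include <string.h>

static void
on_term(int sig, siginfo_t *si, void *uc)
{
        (void)sig;
        (void)si;
        (void)uc;
}

static int
install_term_handler(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_term;      /* three-argument handler */
        sa.sa_flags = SA_SIGINFO;       /* no SA_RESTART: EINTR on interruption */
        sigemptyset(&sa.sa_mask);       /* no extra signals blocked in the handler */
        return (sigaction(SIGTERM, &sa, NULL));
}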
*/ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ SIGDELSET(p->p_siglist, sig); FOREACH_THREAD_IN_PROC(p, td0) SIGDELSET(td0->td_siglist, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* * MPSAFE */ int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* * MPSAFE */ int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif /* * MPSAFE */ int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) && !defined(__alpha__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(p) register struct proc *p; { register struct sigacts *ps; register int sig; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(p->p_siglist, sig); /* * There is only one thread at this point. */ SIGDELSET(FIRST_THREAD_IN_PROC(p)->td_siglist, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Clear out the td's sigmask. Normal processes use the proc sigmask. */ SIGEMPTYSET(FIRST_THREAD_IN_PROC(p)->td_sigmask); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ p->p_sigstk.ss_flags = SS_DISABLE; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_sp = 0; p->p_flag &= ~P_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } /* * sigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? 
&oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * osigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif /* * MPSAFE */ int sigwait(struct thread *td, struct sigwait_args *uap) { siginfo_t info; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, NULL); if (error) return (error); error = copyout(&info.si_signo, uap->sig, sizeof(info.si_signo)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } /* * MPSAFE */ int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; siginfo_t info; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, timeout); if (error) return (error); error = copyout(&info, uap->info, sizeof(info)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } /* * MPSAFE */ int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { siginfo_t info; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, NULL); if (error) return (error); error = copyout(&info, uap->info, sizeof(info)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } static int kern_sigtimedwait(struct thread *td, sigset_t set, siginfo_t *info, struct timespec *timeout) { register struct sigacts *ps; sigset_t oldmask; struct proc *p; int error; int sig; int hz; p = td->td_proc; error = 0; sig = 0; SIG_CANTMASK(set); PROC_LOCK(p); ps = p->p_sigacts; oldmask = td->td_sigmask; td->td_sigmask = set; signotify(td); mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig) goto out; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout) { struct timeval tv; if (timeout->tv_nsec > 1000000000) { error = EINVAL; goto out; } TIMESPEC_TO_TIMEVAL(&tv, timeout); hz = tvtohz(&tv); } else hz = 0; mtx_unlock(&ps->ps_mtx); error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "pause", hz); mtx_lock(&ps->ps_mtx); if (error == EINTR) error = 0; else if (error) goto out; sig = cursig(td); out: td->td_sigmask = oldmask; if (sig) { sig_t action; action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? 
&td->td_oldsigmask : &td->td_sigmask, 0); #endif _STOPEVENT(p, S_SIG, sig); if (action == SIG_DFL) sigexit(td, sig); /* NOTREACHED */ SIGDELSET(td->td_siglist, sig); info->si_signo = sig; info->si_code = 0; } else mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (error); } /* * MPSAFE */ int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t siglist; PROC_LOCK(p); siglist = p->p_siglist; SIGSETOR(siglist, td->td_siglist); PROC_UNLOCK(p); return (copyout(&siglist, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif /* * MPSAFE */ int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t siglist; PROC_LOCK(p); siglist = p->p_siglist; SIGSETOR(siglist, td->td_siglist); PROC_UNLOCK(p); SIG2OSIG(siglist, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ #ifdef COMPAT_SUNOS nsap->sa_flags |= SA_USERTRAMP; #endif } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; #ifdef COMPAT_SUNOS vec.sv_flags &= ~SA_NOCLDSTOP; #endif error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif /* * MPSAFE */ int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif /* * MPSAFE */ int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Suspend process until signal, providing mask to be set * in the meantime. Note nonstandard calling convention: * libc stub passes mask, not pointer, to save a copyin. ***** XXXKSE this doesn't make sense under KSE. ***** Do we suspend the thread or all threads in the process? ***** How do we suspend threads running NOW on another processor? 
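/*
 * Illustrative userland example, not from the patch: the usual pattern for
 * the kern_sigtimedwait() path above -- block the signal first so it stays
 * pending, then collect it synchronously with sigtimedwait(2).
 */
#include <signal.h>
#include <stdio.h>
#include <time.h>

static int
wait_for_usr1(void)
{
        sigset_t set;
        siginfo_t info;
        struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        if (sigprocmask(SIG_BLOCK, &set, NULL) == -1)   /* keep it pending */
                return (-1);
        if (sigtimedwait(&set, &info, &ts) == -1)       /* EAGAIN on timeout */
                return (-1);
        printf("got signal %d\n", info.si_signo);
        return (0);
}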
*/ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct proc *p = td->td_proc; struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } PROC_LOCK(p); oss.ss_sp = p->p_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { p->p_sigstk.ss_sp = nss.ss_sp; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; p->p_flag |= P_ALTSTACK; } PROC_UNLOCK(p); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; PROC_LOCK(p); oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = p->p_sigstk; oss->ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? 
SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) { PROC_UNLOCK(p); return (EPERM); } if ((ss->ss_flags & ~SS_DISABLE) != 0) { PROC_UNLOCK(p); return (EINVAL); } if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) { PROC_UNLOCK(p); return (ENOMEM); } p->p_sigstk = *ss; p->p_flag |= P_ALTSTACK; } else { p->p_flag &= ~P_ALTSTACK; } } PROC_UNLOCK(p); return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* * MPSAFE */ /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* * MPSAFE */ /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* * Send a signal caused by a trap to the current thread. * If it will be caught immediately, deliver it with correct code. 
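/*
 * Illustrative userland example, not from the patch: the pid encodings that
 * kill() dispatches on above (error handling omitted).
 */
#include <sys/types.h>
#include <signal.h>

static void
kill_variants(pid_t child, pid_t pgrp)
{
        (void)kill(child, SIGTERM);     /* pid > 0: exactly one process */
        (void)kill(0, SIGHUP);          /* pid == 0: caller's own process group */
        (void)kill(-pgrp, SIGTERM);     /* pid < -1: process group 'pgrp', like killpg() */
        (void)kill(-1, SIGUSR1);        /* pid == -1: broadcast, permission-checked per process */
}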
* Otherwise, post it normally. * * MPSAFE */ void trapsignal(struct thread *td, int sig, u_long code) { struct sigacts *ps; struct proc *p; p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig, &td->td_sigmask, code); SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(td, sig); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If we know the signal is bound for a specific thread then we * assume that we are in that threads context. This is the case * for SIGXCPU, SIGILL, etc. Otherwise someone did a kill() from * userland and the real thread doesn't actually matter. */ if ((prop & SA_PROC) != 0 && curthread->td_proc == p) return (curthread); /* * We should search for the first thread that is blocked in * sigsuspend with this signal unmasked. */ /* XXX */ /* * Find the first thread in the proc that doesn't have this signal * masked. */ FOREACH_THREAD_IN_PROC(p, td) if (!SIGISMEMBER(td->td_sigmask, sig)) return (td); return (FIRST_THREAD_IN_PROC(p)); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * MPSAFE */ void psignal(struct proc *p, int sig) { struct thread *td; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); /* * Find a thread to deliver the signal to. */ td = sigtd(p, sig, prop); tdsignal(td, sig); } /* * MPSAFE */ void tdsignal(struct thread *td, int sig) { struct proc *p; register sig_t action; sigset_t *siglist; struct thread *td0; register int prop; struct sigacts *ps; KASSERT(_SIG_VALID(sig), ("tdsignal(): invalid signal %d\n", sig)); p = td->td_proc; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); KNOTE(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If this thread is blocking this signal then we'll leave it in the * proc so that we can find it in the first thread that unblocks it. */ if (SIGISMEMBER(td->td_sigmask, sig)) siglist = &p->p_siglist; else siglist = &td->td_siglist; /* * If proc is traced, always give parent a chance; * if signal event is tracked by procfs, give *that* * a chance, as well. 
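sigtd() and tdsignal() above pick a delivery thread by searching for one that does not have the signal in td_sigmask, and park process-wide signals on p_siglist when every thread blocks them. The userland-visible contract is the usual POSIX one: a process-directed signal is delivered to some thread with it unblocked. A sketch of that behaviour, assuming pthreads (link with -pthread); the sleep() is a crude way to let the worker adjust its mask first.

	#include <sys/types.h>
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t got;

	static void
	handler(int sig)
	{
		got = sig;
	}

	static void *
	worker(void *arg)
	{
		sigset_t set;

		(void)arg;
		/* Unblock SIGUSR1 in this thread only; every other thread keeps
		   it blocked, so process-directed delivery lands here, mirroring
		   the sigtd() search for an unblocking thread. */
		sigemptyset(&set);
		sigaddset(&set, SIGUSR1);
		pthread_sigmask(SIG_UNBLOCK, &set, NULL);
		while (got == 0)
			pause();
		printf("worker received signal %d\n", (int)got);
		return (NULL);
	}

	int
	main(void)
	{
		struct sigaction sa;
		pthread_t td;
		sigset_t set;

		sa.sa_handler = handler;
		sa.sa_flags = 0;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGUSR1, &sa, NULL);

		/* Block SIGUSR1 before creating the worker; the worker re-enables it. */
		sigemptyset(&set);
		sigaddset(&set, SIGUSR1);
		pthread_sigmask(SIG_BLOCK, &set, NULL);

		pthread_create(&td, NULL, worker, NULL);
		sleep(1);
		kill(getpid(), SIGUSR1);
		pthread_join(td, NULL);
		return (0);
	}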
*/ if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) { action = SIG_DFL; } else { /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig) || (p->p_flag & P_WEXIT)) { mtx_unlock(&ps->ps_mtx); return; } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; mtx_unlock(&ps->ps_mtx); } if (prop & SA_CONT) { SIG_STOPSIGMASK(p->p_siglist); /* * XXX Should investigate leaving STOP and CONT sigs only in * the proc's siglist. */ FOREACH_THREAD_IN_PROC(p, td0) SIG_STOPSIGMASK(td0->td_siglist); } if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) return; SIG_CONTSIGMASK(p->p_siglist); FOREACH_THREAD_IN_PROC(p, td0) SIG_CONTSIGMASK(td0->td_siglist); p->p_flag &= ~P_CONTINUED; } SIGADDSET(*siglist, sig); signotify(td); /* uses schedlock */ /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediatly, such as * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ goto out; } if (sig == SIGKILL) { /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in siglist as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * siglist. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; p->p_flag |= P_CONTINUED; if (action == SIG_DFL) { SIGDELSET(*siglist, sig); } else if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ goto runfast; } /* * The signal is not ignored or caught. */ mtx_lock_spin(&sched_lock); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. 
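The SA_STOP/SA_CONT handling above is what makes the classic job-control sequence observable from userland: a stop signal suspends every thread in the target process, and SIGCONT resumes it and clears pending stop signals. A short sketch of the visible behaviour using the standard waitpid() status macros; nothing here is specific to this revision.

	#include <sys/types.h>
	#include <sys/wait.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		pid_t pid;
		int status;

		if ((pid = fork()) == 0) {
			for (;;)
				pause();	/* child just idles */
		}

		sleep(1);
		kill(pid, SIGSTOP);
		waitpid(pid, &status, WUNTRACED);
		if (WIFSTOPPED(status))
			printf("child stopped by signal %d\n", WSTOPSIG(status));

		kill(pid, SIGCONT);		/* resumes the child, clears pending stops */
		sleep(1);
		kill(pid, SIGKILL);
		waitpid(pid, &status, 0);
		if (WIFSIGNALED(status))
			printf("child killed by signal %d\n", WTERMSIG(status));
		return (0);
	}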
*/ p->p_flag |= P_STOPPED_SIG; SIGDELSET(*siglist, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) { if (td->td_flags & TDF_CVWAITQ) cv_abort(td); else abortsleep(td); } mtx_unlock_spin(&sched_lock); goto out; /* * XXXKSE What about threads that are waiting on mutexes? * Shouldn't they abort too? * No, hopefully mutexes are short lived.. They'll * eventually hit thread_suspend_check(). */ } else if (p->p_state == PRS_NORMAL) { if ((p->p_flag & P_TRACED) || (action != SIG_DFL) || !(prop & SA_STOP)) { mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td0) { if (TD_IS_SLEEPING(td0) && (td->td_flags & TDF_SINTR)) thread_suspend_one(td0); } thread_stopped(p); if (p->p_numthreads == p->p_suspcount) { SIGDELSET(p->p_siglist, p->p_xstat); FOREACH_THREAD_IN_PROC(p, td0) SIGDELSET(td0->td_siglist, p->p_xstat); } mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ SIGDELSET(*siglist, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); out: /* If we jump here, sched_lock should not be owned. */ mtx_assert(&sched_lock, MA_NOTOWNED); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if ((action == SIG_DFL) && (prop & SA_KILL)) { if (td->td_priority > PUSER) { td->td_priority = PUSER; } } if (TD_IS_SLEEPING(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) { return; } /* * Process is sleeping and traced. Make it runnable * so it can discover the signal in issignal() and stop * for its parent. */ if (p->p_flag & P_TRACED) { p->p_flag &= ~P_STOPPED_TRACE; } else { /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { SIGDELSET(p->p_siglist, sig); /* * It may be on either list in this state. * Remove from both for now. */ SIGDELSET(td->td_siglist, sig); return; } /* * Raise priority to at least PUSER. */ if (td->td_priority > PUSER) { td->td_priority = PUSER; } } if (td->td_flags & TDF_CVWAITQ) cv_abort(td); else abortsleep(td); } #ifdef SMP else { /* * Other states do nothing with the signal immediatly, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. 
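tdsigwakeup() only aborts sleeps that were entered interruptibly (TDF_SINTR); uninterruptible sleeps notice the signal later, on return through trap() or syscall(). From userland the same distinction shows up as a blocking system call failing with EINTR when a handler installed without SA_RESTART interrupts it. A minimal sketch:

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void
	handler(int sig)
	{
		(void)sig;	/* nothing to do; the point is to interrupt read() */
	}

	int
	main(void)
	{
		struct sigaction sa;
		char buf[1];
		ssize_t n;

		sa.sa_handler = handler;
		sa.sa_flags = 0;	/* no SA_RESTART: let read() fail with EINTR */
		sigemptyset(&sa.sa_mask);
		sigaction(SIGALRM, &sa, NULL);

		alarm(2);			/* interrupt the sleep below after 2 seconds */
		n = read(STDIN_FILENO, buf, 1);	/* blocks in an interruptible sleep */
		if (n == -1 && errno == EINTR)
			printf("read interrupted by a signal\n");
		return (0);
	}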
*/ if (TD_IS_RUNNING(td) && td != curthread) { forward_signal(td); } } #endif } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; register int sig, prop; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_siglist; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); _STOPEVENT(p, S_SIG, sig); /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { SIGDELSET(td->td_siglist, sig); continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Stopping for traced signal"); p->p_xstat = sig; PROC_LOCK(p->p_pptr); psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); /* uses schedlock too eventually */ thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ SIGDELSET(td->td_siglist, sig); /* clear old signal */ sig = p->p_xstat; if (sig == 0) continue; /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; /* * Put the new signal into td_siglist. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_siglist, sig); if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. 
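issignal() repeatedly computes "pending and not blocked" (SIGSETNAND of td_siglist and td_sigmask) and then takes the lowest-numbered set bit with sig_ffs(). A self-contained sketch of the same selection on a single machine word; the real sigset_t spans several words and the helper names here are invented for illustration.

	#include <stdio.h>
	#include <strings.h>		/* ffs() */

	/*
	 * Return the lowest signal number that is pending and not masked,
	 * or 0 if there is none.  Bit (sig - 1) represents signal sig,
	 * matching the usual sigset encoding, so ffs() yields sig directly.
	 */
	static int
	first_deliverable(unsigned int pending, unsigned int mask)
	{
		return (ffs(pending & ~mask));
	}

	int
	main(void)
	{
		unsigned int pending = (1u << (10 - 1)) | (1u << (2 - 1));
		unsigned int mask = 1u << (2 - 1);	/* signal 2 is blocked */

		/* Signals 2 and 10 are pending, 2 is masked, so 10 is chosen. */
		printf("next signal: %d\n", first_deliverable(pending, mask));
		return (0);
	}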
*/ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } SIGDELSET(td->td_siglist, sig); /* take the signal! */ } /* NOTREACHED */ } /* * Put the argument process into the stopped state and notify the parent * via wakeup. Signals are handled elsewhere. The process must not be * on the run queue. Must be called with the proc p locked and the scheduler * lock held. */ static void stop(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_STOPPED_SIG; p->p_flag &= ~P_WAITED; wakeup(p->p_pptr); } /* * MPSAFE */ void thread_stopped(struct proc *p) { struct proc *p1 = curthread->td_proc; struct sigacts *ps; int n; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); n = p->p_suspcount; if (p == p1) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); stop(p); PROC_LOCK(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); psignal(p->p_pptr, SIGCHLD); } else mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(td->td_siglist, sig); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif _STOPEVENT(p, S_SIG, sig); if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. 
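The TDP_OLDMASK/td_oldsigmask handling described here is what makes the classic sigsuspend() idiom race-free: the caller blocks the signal, tests its flag, atomically installs a wider-open mask while waiting, and gets the pre-suspend mask back once the handler has run. A userland sketch of that idiom (SIGUSR1 is an arbitrary choice):

	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t got_usr1;

	static void
	handler(int sig)
	{
		(void)sig;
		got_usr1 = 1;
	}

	int
	main(void)
	{
		struct sigaction sa;
		sigset_t block, oldmask;

		sa.sa_handler = handler;
		sa.sa_flags = 0;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGUSR1, &sa, NULL);

		/* Block SIGUSR1 so there is no window between the test and the wait. */
		sigemptyset(&block);
		sigaddset(&block, SIGUSR1);
		sigprocmask(SIG_BLOCK, &block, &oldmask);

		printf("pid %d waiting for SIGUSR1\n", (int)getpid());
		while (!got_usr1)
			sigsuspend(&oldmask);	/* always returns -1 with errno == EINTR */

		/* On return the pre-sigsuspend mask (SIGUSR1 blocked) is in effect
		   again, exactly the restoration postsig() arranges via td_oldsigmask. */
		sigprocmask(SIG_SETMASK, &oldmask, NULL);
		return (0);
	}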
*/ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } p->p_stats->p_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_signal_add(curthread, sig); else (*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code); } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. * * MPSAFE */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; if (sigprop(sig) & SA_CORE) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too */ PROC_UNLOCK(p); if (!mtx_owned(&Giant)) mtx_lock(&Giant); if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else { PROC_UNLOCK(p); if (!mtx_owned(&Giant)) mtx_lock(&Giant); } exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } static char corefilename[MAXPATHLEN+1] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
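expand_name() below interprets the kern.corefile format string with %N (process name), %P (pid) and %U (uid). The template itself is an ordinary sysctl string, so it can be inspected from userland with sysctlbyname(3); a hedged sketch, with error handling kept minimal:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		char fmt[256];
		size_t len = sizeof(fmt);

		/* Fetch the current core-file name template, e.g. "%N.core". */
		if (sysctlbyname("kern.corefile", fmt, &len, NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("kern.corefile = %s\n", fmt);
		return (0);
	}

Passing a new string via the newp/newlen arguments (with sufficient privilege) changes the template, e.g. to route all dumps to a central directory such as "/cores/%U/%N-%P".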
*/ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags; struct mount *mp; char *name; /* name of corefile */ off_t limit; PROC_LOCK(p); _STOPEVENT(p, S_CORE, 0); if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = p->p_rlimit[RLIMIT_CORE].rlim_cur; if (limit == 0) { PROC_UNLOCK(p); return 0; } PROC_UNLOCK(p); restart: name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */ flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR); free(name, M_TEMP); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. 
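coredump() checks p_rlimit[RLIMIT_CORE] before creating anything: a limit of zero suppresses the dump entirely, while a non-zero limit truncates an oversized dump rather than suppressing it. From userland the same knob is the ordinary resource limit; a small sketch:

	#include <sys/types.h>
	#include <sys/time.h>
	#include <sys/resource.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct rlimit rl;

		getrlimit(RLIMIT_CORE, &rl);
		printf("core limit: cur=%jd max=%jd\n",
		    (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);

		/* A soft limit of 0 makes coredump() return before creating a file. */
		rl.rlim_cur = 0;
		if (setrlimit(RLIMIT_CORE, &rl) == -1)
			perror("setrlimit");
		return (0);
	}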
*/ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto out2; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK); if (error) goto out2; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) return (error); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); vn_finished_write(mp); out2: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; return (error); } /* * Nonexistent system call-- signal process (may want to handle it). * Flag error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* * MPSAFE */ /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using * stored credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ PROC_LOCK(p); SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); PROC_UNLOCK(p); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); PROC_UNLOCK(p); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
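filt_sigattach()/filt_signal() above are the kernel half of EVFILT_SIGNAL: tdsignal()'s KNOTE(&p->p_klist, NOTE_SIGNAL | sig) bumps kn_data once per delivery attempt, and EV_CLEAR is forced so the count resets after each retrieval. A userland sketch of consuming the filter, following the documented kqueue interface; the double kill() simply shows the coalesced count in kev.data.

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct kevent kev;
		int kq;

		/* Ignore the signal so the default action doesn't kill us; the
		   knote still fires on every delivery attempt. */
		signal(SIGUSR1, SIG_IGN);

		kq = kqueue();
		EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
		kevent(kq, &kev, 1, NULL, 0, NULL);	/* register the knote */

		kill(getpid(), SIGUSR1);
		kill(getpid(), SIGUSR1);

		/* Wait for the event; kev.data counts deliveries since the last check. */
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
			printf("signal %d delivered %d time(s)\n",
			    (int)kev.ident, (int)kev.data);
		return (0);
	}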
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 116360) +++ head/sys/kern/kern_switch.c (revision 116361) @@ -1,723 +1,723 @@ /* * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*** Here is the logic.. If there are N processors, then there are at most N KSEs (kernel schedulable entities) working to process threads that belong to a KSEGOUP (kg). If there are X of these KSEs actually running at the moment in question, then there are at most M (N-X) of these KSEs on the run queue, as running KSEs are not on the queue. Runnable threads are queued off the KSEGROUP in priority order. If there are M or more threads runnable, the top M threads (by priority) are 'preassigned' to the M KSEs not running. The KSEs take their priority from those threads and are put on the run queue. The last thread that had a priority high enough to have a KSE associated with it, AND IS ON THE RUN QUEUE is pointed to by kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs assigned as all the available KSEs are activly running, or because there are no threads queued, that pointer is NULL. 
When a KSE is removed from the run queue to become runnable, we know it was associated with the highest priority thread in the queue (at the head of the queue). If it is also the last assigned we know M was 1 and must now be 0. Since the thread is no longer queued that pointer must be removed from it. Since we know there were no more KSEs available, (M was 1 and is now 0) and since we are not FREEING our KSE but using it, we know there are STILL no more KSEs available, we can prove that the next thread in the ksegrp list will not have a KSE to assign to it, so we can show that the pointer must be made 'invalid' (NULL). The pointer exists so that when a new thread is made runnable, it can have its priority compared with the last assigned thread to see if it should 'steal' its KSE or not.. i.e. is it 'earlier' on the list than that thread or later.. If it's earlier, then the KSE is removed from the last assigned (which is now not assigned a KSE) and reassigned to the new thread, which is placed earlier in the list. The pointer is then backed up to the previous thread (which may or may not be the new thread). When a thread sleeps or is removed, the KSE becomes available and if there are queued threads that are not assigned KSEs, the highest priority one of them is assigned the KSE, which is then placed back on the run queue at the approipriate place, and the kg->kg_last_assigned pointer is adjusted down to point to it. The following diagram shows 2 KSEs and 3 threads from a single process. RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads) \ \____ \ \ KSEGROUP---thread--thread--thread (queued in priority order) \ / \_______________/ (last_assigned) The result of this scheme is that the M available KSEs are always queued at the priorities they have inherrited from the M highest priority threads for that KSEGROUP. If this situation changes, the KSEs are reassigned to keep this true. ***/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if defined(SMP) && defined(__i386__) #include #endif #include CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); void panc(char *string1, char *string2); #if 0 static void runq_readjust(struct runq *rq, struct kse *ke); #endif /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ /* * Select the KSE that will be run next. From that find the thread, and * remove it from the KSEGRP's run queue. If there is thread clustering, * this will be what does it. 
*/ struct thread * choosethread(void) { struct kse *ke; struct thread *td; struct ksegrp *kg; #if defined(SMP) && defined(__i386__) if (smp_active == 0 && PCPU_GET(cpuid) != 0) { /* Shutting down, run idlethread on AP's */ td = PCPU_GET(idlethread); ke = td->td_kse; CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); ke->ke_flags |= KEF_DIDRUN; TD_SET_RUNNING(td); return (td); } #endif retry: ke = sched_choose(); if (ke) { td = ke->ke_thread; KASSERT((td->td_kse == ke), ("kse/thread mismatch")); kg = ke->ke_ksegrp; - if (td->td_proc->p_flag & P_THREADED) { + if (td->td_proc->p_flag & P_SA) { if (kg->kg_last_assigned == td) { kg->kg_last_assigned = TAILQ_PREV(td, threadqueue, td_runq); } TAILQ_REMOVE(&kg->kg_runq, td, td_runq); } kg->kg_runnable--; CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d", td, td->td_priority); } else { /* Simulate runq_choose() having returned the idle thread */ td = PCPU_GET(idlethread); ke = td->td_kse; CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); } ke->ke_flags |= KEF_DIDRUN; /* * If we are in panic, only allow system threads, * plus the one we are running in, to be run. */ if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 && (td->td_flags & TDF_INPANIC) == 0)) { /* note that it is no longer on the run queue */ TD_SET_CAN_RUN(td); goto retry; } TD_SET_RUNNING(td); return (td); } /* * Given a surplus KSE, either assign a new runable thread to it * (and put it in the run queue) or put it in the ksegrp's idle KSE list. * Assumes that the original thread is not runnable. */ void kse_reassign(struct kse *ke) { struct ksegrp *kg; struct thread *td; struct thread *original; mtx_assert(&sched_lock, MA_OWNED); original = ke->ke_thread; KASSERT(original == NULL || TD_IS_INHIBITED(original), ("reassigning KSE with runnable thread")); kg = ke->ke_ksegrp; if (original) original->td_kse = NULL; /* * Find the first unassigned thread */ if ((td = kg->kg_last_assigned) != NULL) td = TAILQ_NEXT(td, td_runq); else td = TAILQ_FIRST(&kg->kg_runq); /* * If we found one, assign it the kse, otherwise idle the kse. */ if (td) { kg->kg_last_assigned = td; td->td_kse = ke; ke->ke_thread = td; sched_add(ke); CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td); return; } ke->ke_state = KES_IDLE; ke->ke_thread = NULL; TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses++; CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke); return; } #if 0 /* * Remove a thread from its KSEGRP's run queue. * This in turn may remove it from a KSE if it was already assigned * to one, possibly causing a new thread to be assigned to the KSE * and the KSE getting a new priority. */ static void remrunqueue(struct thread *td) { struct thread *td2, *td3; struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue")); kg = td->td_ksegrp; ke = td->td_kse; CTR1(KTR_RUNQ, "remrunqueue: td%p", td); kg->kg_runnable--; TD_SET_CAN_RUN(td); /* * If it is not a threaded process, take the shortcut. */ - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* Bring its kse with it, leave the thread attached */ sched_rem(ke); ke->ke_state = KES_THREAD; return; } td3 = TAILQ_PREV(td, threadqueue, td_runq); TAILQ_REMOVE(&kg->kg_runq, td, td_runq); if (ke) { /* * This thread has been assigned to a KSE. * We need to dissociate it and try assign the * KSE to the next available thread. Then, we should * see if we need to move the KSE in the run queues. 
*/ sched_rem(ke); ke->ke_state = KES_THREAD; td2 = kg->kg_last_assigned; KASSERT((td2 != NULL), ("last assigned has wrong value")); if (td2 == td) kg->kg_last_assigned = td3; kse_reassign(ke); } } #endif /* * Change the priority of a thread that is on the run queue. */ void adjustrunqueue( struct thread *td, int newpri) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue")); ke = td->td_kse; CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td); /* * If it is not a threaded process, take the shortcut. */ - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* We only care about the kse in the run queue. */ td->td_priority = newpri; if (ke->ke_rqindex != (newpri / RQ_PPQ)) { sched_rem(ke); sched_add(ke); } return; } /* It is a threaded process */ kg = td->td_ksegrp; kg->kg_runnable--; TD_SET_CAN_RUN(td); if (ke) { if (kg->kg_last_assigned == td) { kg->kg_last_assigned = TAILQ_PREV(td, threadqueue, td_runq); } sched_rem(ke); } TAILQ_REMOVE(&kg->kg_runq, td, td_runq); td->td_priority = newpri; setrunqueue(td); } void setrunqueue(struct thread *td) { struct kse *ke; struct ksegrp *kg; struct thread *td2; struct thread *tda; CTR1(KTR_RUNQ, "setrunqueue: td%p", td); mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("setrunqueue: bad thread state")); TD_SET_RUNQ(td); kg = td->td_ksegrp; kg->kg_runnable++; - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* * Common path optimisation: Only one of everything * and the KSE is always already attached. * Totally ignore the ksegrp run queue. */ sched_add(td->td_kse); return; } tda = kg->kg_last_assigned; if ((ke = td->td_kse) == NULL) { if (kg->kg_idle_kses) { /* * There is a free one so it's ours for the asking.. */ ke = TAILQ_FIRST(&kg->kg_iq); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); ke->ke_state = KES_THREAD; kg->kg_idle_kses--; } else if (tda && (tda->td_priority > td->td_priority)) { /* * None free, but there is one we can commandeer. */ ke = tda->td_kse; tda->td_kse = NULL; ke->ke_thread = NULL; tda = kg->kg_last_assigned = TAILQ_PREV(tda, threadqueue, td_runq); sched_rem(ke); } } else { /* * Temporarily disassociate so it looks like the other cases. */ ke->ke_thread = NULL; td->td_kse = NULL; } /* * Add the thread to the ksegrp's run queue at * the appropriate place. */ TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { if (td2->td_priority > td->td_priority) { TAILQ_INSERT_BEFORE(td2, td, td_runq); break; } } if (td2 == NULL) { /* We ran off the end of the TAILQ or it was empty. */ TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); } /* * If we have a ke to use, then put it on the run queue and * If needed, readjust the last_assigned pointer. */ if (ke) { if (tda == NULL) { /* * No pre-existing last assigned so whoever is first * gets the KSE we brought in.. (maybe us) */ td2 = TAILQ_FIRST(&kg->kg_runq); KASSERT((td2->td_kse == NULL), ("unexpected ke present")); td2->td_kse = ke; ke->ke_thread = td2; kg->kg_last_assigned = td2; } else if (tda->td_priority > td->td_priority) { /* * It's ours, grab it, but last_assigned is past us * so don't change it. */ td->td_kse = ke; ke->ke_thread = td; } else { /* * We are past last_assigned, so * put the new kse on whatever is next, * which may or may not be us. 
*/ td2 = TAILQ_NEXT(tda, td_runq); kg->kg_last_assigned = td2; td2->td_kse = ke; ke->ke_thread = td2; } sched_add(ke); } } /************************************************************************ * Critical section marker functions * ************************************************************************/ /* Critical sections that prevent preemption. */ void critical_enter(void) { struct thread *td; td = curthread; if (td->td_critnest == 0) cpu_critical_enter(); td->td_critnest++; } void critical_exit(void) { struct thread *td; td = curthread; if (td->td_critnest == 1) { td->td_critnest = 0; cpu_critical_exit(); } else { td->td_critnest--; } } /************************************************************************ * SYSTEM RUN QUEUE manipulations and tests * ************************************************************************/ /* * Initialize a run structure. */ void runq_init(struct runq *rq) { int i; bzero(rq, sizeof *rq); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. */ static __inline void runq_clrbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* * Find the index of the first non-empty run queue. This is done by * scanning the status bits, a set bit indicates a non-empty queue. */ static __inline int runq_findbit(struct runq *rq) { struct rqbits *rqb; int pri; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", rqb->rqb_bits[i], i, pri); return (pri); } return (-1); } /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. */ static __inline void runq_setbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); } /* * Add the KSE to the queue specified by its priority, and set the * corresponding status bit. */ void runq_add(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; pri = ke->ke_thread->td_priority / RQ_PPQ; ke->ke_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); TAILQ_INSERT_TAIL(rqh, ke, ke_procq); } /* * Return true if there are runnable processes of any priority on the run * queue, false otherwise. Has no side effects, does not modify the run * queue structure. */ int runq_check(struct runq *rq) { struct rqbits *rqb; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", rqb->rqb_bits[i], i); return (1); } CTR0(KTR_RUNQ, "runq_check: empty"); return (0); } /* * Find the highest priority process on the run queue. 
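runq_setbit()/runq_clrbit()/runq_findbit() above keep one status bit per priority queue so the scheduler can locate the highest-priority non-empty queue with a find-first-set scan instead of walking all RQ_NQS queues. A stripped-down, single-word sketch of the same idea; the real code spreads the bits across RQB_LEN words and hangs KSEs off TAILQs, and every name here is invented for illustration.

	#include <stdio.h>
	#include <strings.h>		/* ffs() */

	#define NQUEUES		32	/* one bit per priority queue in this sketch */

	struct miniq {
		unsigned int status;	/* bit i set => queue i is non-empty */
		int len[NQUEUES];	/* element counts stand in for the TAILQs */
	};

	static void
	miniq_add(struct miniq *q, int pri)
	{
		q->len[pri]++;
		q->status |= 1u << pri;			/* runq_setbit() */
	}

	static void
	miniq_remove(struct miniq *q, int pri)
	{
		if (--q->len[pri] == 0)
			q->status &= ~(1u << pri);	/* runq_clrbit() when empty */
	}

	/* runq_findbit(): lowest-numbered (best-priority) non-empty queue, or -1. */
	static int
	miniq_findbit(struct miniq *q)
	{
		return (q->status ? ffs(q->status) - 1 : -1);
	}

	int
	main(void)
	{
		struct miniq q = { 0, { 0 } };

		miniq_add(&q, 12);
		miniq_add(&q, 4);
		printf("best queue: %d\n", miniq_findbit(&q));	/* 4 */
		miniq_remove(&q, 4);
		printf("best queue: %d\n", miniq_findbit(&q));	/* 12 */
		return (0);
	}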
*/ struct kse * runq_choose(struct runq *rq) { struct rqhead *rqh; struct kse *ke; int pri; mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; ke = TAILQ_FIRST(rqh); KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); return (ke); } CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); return (NULL); } /* * Remove the KSE from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. * Caller must set ke->ke_state afterwards. */ void runq_remove(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); pri = ke->ke_rqindex; rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", ke, ke->ke_thread->td_priority, pri, rqh); KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); TAILQ_REMOVE(rqh, ke, ke_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove: empty"); runq_clrbit(rq, pri); } } #if 0 void panc(char *string1, char *string2) { printf("%s", string1); Debugger(string2); } void thread_sanity_check(struct thread *td, char *string) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct thread *td2 = NULL; unsigned int prevpri; int saw_lastassigned = 0; int unassigned = 0; int assigned = 0; p = td->td_proc; kg = td->td_ksegrp; ke = td->td_kse; if (ke) { if (p != ke->ke_proc) { panc(string, "wrong proc"); } if (ke->ke_thread != td) { panc(string, "wrong thread"); } } - if ((p->p_flag & P_THREADED) == 0) { + if ((p->p_flag & P_SA) == 0) { if (ke == NULL) { panc(string, "non KSE thread lost kse"); } } else { prevpri = 0; saw_lastassigned = 0; unassigned = 0; assigned = 0; TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { if (td2->td_priority < prevpri) { panc(string, "thread runqueue unosorted"); } if ((td2->td_state == TDS_RUNQ) && td2->td_kse && (td2->td_kse->ke_state != KES_ONRUNQ)) { panc(string, "KSE wrong state"); } prevpri = td2->td_priority; if (td2->td_kse) { assigned++; if (unassigned) { panc(string, "unassigned before assigned"); } if (kg->kg_last_assigned == NULL) { panc(string, "lastassigned corrupt"); } if (saw_lastassigned) { panc(string, "last assigned not last"); } if (td2->td_kse->ke_thread != td2) { panc(string, "mismatched kse/thread"); } } else { unassigned++; } if (td2 == kg->kg_last_assigned) { saw_lastassigned = 1; if (td2->td_kse == NULL) { panc(string, "last assigned not assigned"); } } } if (kg->kg_last_assigned && (saw_lastassigned == 0)) { panc(string, "where on earth does lastassigned point?"); } #if 0 FOREACH_THREAD_IN_GROUP(kg, td2) { if (((td2->td_flags & TDF_UNBOUND) == 0) && (TD_ON_RUNQ(td2))) { assigned++; if (td2->td_kse == NULL) { panc(string, "BOUND thread with no KSE"); } } } #endif #if 0 if ((unassigned + assigned) != kg->kg_runnable) { panc(string, "wrong number in runnable"); } #endif } if (assigned == 12345) { printf("%p %p %p %p %p %d, %d", td, td2, ke, kg, p, assigned, saw_lastassigned); } } #endif Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 116360) +++ head/sys/kern/kern_synch.c (revision 116361) @@ -1,695 +1,695 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #ifdef __i386__ #include "opt_swtch.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #ifdef SWTCH_OPTIM_STATS #include #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) int hogticks; int lbolt; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void endtsleep(void *); static void loadav(void *arg); static void lboltcb(void *arg); /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. 
*/ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; hogticks = (hz / 10) * 2; /* Default only. */ for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct thread *td = curthread; struct proc *p = td->td_proc; int sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif /* XXX: mtx == NULL ?? */ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mtx->mtx_object, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); /* * If we are capable of async syscalls and there isn't already * another one ready to return, start a new thread * and queue it as ready to run. Note that there is danger here * because we need to make sure that we don't sleep allocating * the thread (recursion here might be bad). */ mtx_lock_spin(&sched_lock); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * Just don't bother if we are exiting * and not the exiting thread or thread was marked as * interrupted. */ if (catch && (((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) || (td->td_flags & TDF_INTERRUPT))) { td->td_flags &= ~TDF_INTERRUPT; mtx_unlock_spin(&sched_lock); return (EINTR); } } if (cold ) { /* * During autoconfiguration, just give interrupts * a chance, then just return. * Don't run any other procs or panic below, * in case this is the idle process and already asleep. */ if (mtx != NULL && priority & PDROP) mtx_unlock(mtx); mtx_unlock_spin(&sched_lock); return (0); } DROP_GIANT(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock(mtx); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", td, p->p_pid, p->p_comm, wmesg, ident); td->td_wchan = ident; td->td_wmesg = wmesg; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); TD_SET_ON_SLEEPQ(td); if (timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, and * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. 
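msleep() releases the supplied mutex while the thread sleeps, reacquires it on wakeup unless PDROP was given, and reports a timeout as EWOULDBLOCK. The nearest userland analogue is a condition-variable wait with a deadline, where the mutex is likewise dropped during the wait and held again on return. A sketch of that pattern, assuming pthreads; the one-second deadline is arbitrary and ETIMEDOUT plays the role of EWOULDBLOCK.

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static int ready;

	int
	main(void)
	{
		struct timespec ts;
		int error = 0;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;			/* roughly "timo = hz" */

		pthread_mutex_lock(&lock);
		while (!ready && error != ETIMEDOUT) {
			/* The mutex is released while waiting and held again on
			   return, just as msleep() drops and reacquires mtx. */
			error = pthread_cond_timedwait(&cond, &lock, &ts);
		}
		pthread_mutex_unlock(&lock);

		printf(error == ETIMEDOUT ? "timed out\n" : "woken\n");
		return (0);
	}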
*/ if (catch) { CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); sig = cursig(td); mtx_unlock(&p->p_sigacts->ps_mtx); if (sig == 0 && thread_suspend_check(1)) sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (TD_ON_SLEEPQ(td)) unsleep(td); } else if (!TD_ON_SLEEPQ(td)) catch = 0; } else sig = 0; /* * Let the scheduler know we're about to voluntarily go to sleep. */ sched_sleep(td, priority & PRIMASK); if (TD_ON_SLEEPQ(td)) { p->p_stats->p_ru.ru_nvcsw++; TD_SET_SLEEPING(td); mi_switch(); } /* * We're awake from voluntary sleep. */ CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; if (sig == 0) rval = EWOULDBLOCK; } else if (td->td_flags & TDF_TIMOFAIL) { td->td_flags &= ~TDF_TIMOFAIL; } else if (timo && callout_stop(&td->td_slpcallout) == 0) { /* * This isn't supposed to be pretty. If we are here, then * the endtsleep() callout is currently executing on another * CPU and is either spinning on the sched_lock or will be * soon. If we don't synchronize here, there is a chance * that this process may msleep() again before the callout * has a chance to run and the callout may end up waking up * the wrong msleep(). Yuck. */ TD_SET_SLEEPING(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); td->td_flags &= ~TDF_TIMOFAIL; } if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) && (rval == 0)) { td->td_flags &= ~TDF_INTERRUPT; rval = EINTR; } mtx_unlock_spin(&sched_lock); if (rval == 0 && catch) { PROC_LOCK(p); /* XXX: shouldn't we always be calling cursig() */ mtx_lock(&p->p_sigacts->ps_mtx); if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); } return (rval); } /* * Implement timeout for msleep() * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. */ static void endtsleep(arg) void *arg; { register struct thread *td = arg; CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); /* * This is the other half of the synchronization with msleep() * described above. If the TDS_TIMEOUT flag is set, we lost the * race and just need to put the process back on the runqueue. */ if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_flags |= TDF_TIMEOUT; td->td_wmesg = NULL; } else { td->td_flags |= TDF_TIMOFAIL; } TD_CLR_SLEEPING(td); setrunnable(td); mtx_unlock_spin(&sched_lock); } /* * Abort a thread, as if an interrupt had occured. Only abort * interruptable waits (unfortunatly it isn't only safe to abort others). * This is about identical to cv_abort(). * Think about merging them? * Also, whatever the signal code does... */ void abortsleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. 
*/ if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { if (TD_ON_SLEEPQ(td)) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); } } } /* * Remove a process from its wait queue */ void unsleep(struct thread *td) { mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_wmesg = NULL; } mtx_unlock_spin(&sched_lock); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; struct thread *ntd; struct proc *p; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); goto restart; } } mtx_unlock_spin(&sched_lock); } /* * Make a process sleeping on the specified identifier runnable. * May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; register struct proc *p; struct thread *ntd; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); break; } } mtx_unlock_spin(&sched_lock); } /* * The machine independent parts of mi_switch(). */ void mi_switch(void) { struct bintime new_switchtime; struct thread *td; #if !defined(__alpha__) && !defined(__powerpc__) struct thread *newtd; #endif struct proc *p; u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1, ("mi_switch: switch in a critical section")); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); #ifdef DDB /* * Don't perform context switches from the debugger. */ if (db_active) { mtx_unlock_spin(&sched_lock); db_print_backtrace(); db_error("Context switches not allowed in the debugger."); } #endif /* * Check if the process exceeds its cpu resource allocation. If * over max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_runtime.sec > p->p_cpulimit) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); sched_nest = sched_lock.mtx_recurse; - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) thread_switchout(td); sched_switchout(td); #if !defined(__alpha__) && !defined(__powerpc__) newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); /* SHAZAM!! 
*/ #ifdef SWTCH_OPTIM_STATS else stupid_switch++; #endif #else cpu_switch(); /* SHAZAM!!*/ #endif sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; sched_switchin(td); /* * Start setting up stats etc. for the incoming thread. * Similar code in fork_exit() is returned to by cpu_switch() * in the case of a new thread/process. */ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Call the switchin function while still holding the scheduler lock * (used by the idlezero code and the general page-zeroing code) */ if (td->td_switchin) td->td_switchin(); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. */ void setrunnable(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; struct proc *p; struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { switch (td->td_state) { case TDS_RUNQ: case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; nrun++; /* XXXKSE */ default: break; } nextproc: continue; } } sx_sunlock(&allproc_lock); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&loadav_callout, 0); callout_init(&lbolt_callout, 1); /* Kick off timeout driven events by calling first time. 
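 * Both callbacks re-arm themselves: loadav() reschedules itself every
 * 4-6 seconds (randomized, see above) and lboltcb() every hz ticks, so one
 * initial call here is enough.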
*/ loadav(NULL); lboltcb(NULL); } /* * General purpose yield system call */ int yield(struct thread *td, struct yield_args *uap) { struct ksegrp *kg = td->td_ksegrp; mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 116360) +++ head/sys/kern/kern_thread.c (revision 116361) @@ -1,2022 +1,2022 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. 
*/ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; static uma_zone_t upcall_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int thread_debug = 0; SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW, &thread_debug, 0, "thread debug"); static int max_threads_per_proc = 150; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_groups_per_proc = 50; SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW, &max_groups_per_proc, 0, "Limit on thread groups per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); static int virtual_cpu; #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); TAILQ_HEAD(, kse_upcall) zombie_upcalls = TAILQ_HEAD_INITIALIZER(zombie_upcalls); struct mtx kse_zombie_lock; MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); static void kse_purge(struct proc *p, struct thread *td); static void kse_purge_group(struct thread *td); static int thread_update_usr_ticks(struct thread *td, int user); static void thread_alloc_spare(struct thread *td, struct thread *spare); static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; #ifdef SMP def_val = mp_ncpus; #else def_val = 1; #endif if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); /* * Prepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; td = (struct thread *)mem; mtx_lock(&Giant); vm_thread_new(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); td->td_sched = (struct td_sched *)&td[1]; } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; vm_thread_dispose(td); } /* * Initialize type-stable parts of a kse (when newly created). 
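 * As with thread_init() above, this runs only when UMA first creates the
 * item, not on every allocation from the zone.  The scheduler-private area
 * lives in the same allocation, directly after the structure (&ke[1]); the
 * extra space is sized via sched_sizeof_kse() in threadinit() below.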
*/ static void kse_init(void *mem, int size) { struct kse *ke; ke = (struct kse *)mem; ke->ke_sched = (struct ke_sched *)&ke[1]; } /* * Initialize type-stable parts of a ksegrp (when newly created). */ static void ksegrp_init(void *mem, int size) { struct ksegrp *kg; kg = (struct ksegrp *)mem; kg->kg_sched = (struct kg_sched *)&kg[1]; } /* * KSE is linked into kse group. */ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; ke->ke_flags = 0; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (ke->ke_state == KES_IDLE) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; } if (--kg->kg_kses == 0) ksegrp_unlink(kg); /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */ TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */ kg->kg_proc = p; /* * the following counters are in the -zero- section * and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ kg->kg_idle_kses = 0; kg->kg_numupcalls = 0; /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads")); KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses")); KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls")); p = kg->kg_proc; TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } struct kse_upcall * upcall_alloc(void) { struct kse_upcall *ku; ku = uma_zalloc(upcall_zone, M_WAITOK); bzero(ku, sizeof(*ku)); return (ku); } void upcall_free(struct kse_upcall *ku) { uma_zfree(upcall_zone, ku); } void upcall_link(struct kse_upcall *ku, struct ksegrp *kg) { mtx_assert(&sched_lock, MA_OWNED); TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link); ku->ku_ksegrp = kg; kg->kg_numupcalls++; } void upcall_unlink(struct kse_upcall *ku) { struct ksegrp *kg = ku->ku_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link); kg->kg_numupcalls--; upcall_stash(ku); } void upcall_remove(struct thread *td) { if (td->td_upcall) { td->td_upcall->ku_owner = NULL; upcall_unlink(td->td_upcall); td->td_upcall = 0; } } /* * For a newly created process, * link up all the structures and its initial threads etc. 
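 * The proc's ksegrp, thread and suspended-thread lists start out empty;
 * ksegrp_link(), kse_link() and thread_link() then chain the initial ksegrp
 * into the proc, the initial kse into that ksegrp, and the initial thread
 * into the same ksegrp.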
*/ void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td) { TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_suspended); /* Threads suspended */ p->p_numksegrps = 0; p->p_numthreads = 0; ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); } /* struct kse_thr_interrupt_args { struct kse_thr_mailbox * tmbx; }; */ int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { struct proc *p; struct thread *td2; p = td->td_proc; - if (!(p->p_flag & P_THREADED) || (uap->tmbx == NULL)) + if (!(p->p_flag & P_SA) || (uap->tmbx == NULL)) return (EINVAL); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) { td2->td_flags |= TDF_INTERRUPT; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } mtx_unlock_spin(&sched_lock); return (0); } } mtx_unlock_spin(&sched_lock); return (ESRCH); } /* struct kse_exit_args { register_t dummy; }; */ int kse_exit(struct thread *td, struct kse_exit_args *uap) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct kse_upcall *ku, *ku2; int error, count; p = td->td_proc; if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) return (EINVAL); kg = td->td_ksegrp; count = 0; PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_UPCALL_IN_GROUP(kg, ku2) { if (ku2->ku_flags & KUF_EXITING) count++; } if ((kg->kg_numupcalls - count) == 1 && (kg->kg_numthreads > 1)) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); error = suword(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE); PROC_LOCK(p); if (error) psignal(p, SIGSEGV); mtx_lock_spin(&sched_lock); upcall_remove(td); ke = td->td_kse; if (p->p_numthreads == 1) { kse_purge(p, td); - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } else { if (kg->kg_numthreads == 1) { /* Shutdown a group */ kse_purge_group(td); ke->ke_flags |= KEF_EXIT; } thread_stopped(p); thread_exit(); /* NOTREACHED */ } return (0); } /* * Either becomes an upcall or waits for an awakening event and * then becomes an upcall. Only error cases return. */ /* struct kse_release_args { struct timespec *timeout; }; */ int kse_release(struct thread *td, struct kse_release_args *uap) { struct proc *p; struct ksegrp *kg; struct timespec ts, ts2, ts3, timeout; struct timeval tv; int error; p = td->td_proc; kg = td->td_ksegrp; if (td->td_upcall == NULL || TD_CAN_UNBIND(td)) return (EINVAL); if (uap->timeout != NULL) { if ((error = copyin(uap->timeout, &timeout, sizeof(timeout)))) return (error); getnanouptime(&ts); timespecadd(&ts, &timeout); TIMESPEC_TO_TIMEVAL(&tv, &timeout); } mtx_lock_spin(&sched_lock); /* Change OURSELF to become an upcall. */ td->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (p->p_sflag & PS_NEEDSIGCHK) td->td_flags |= TDF_ASTPENDING; #endif mtx_unlock_spin(&sched_lock); PROC_LOCK(p); while ((td->td_upcall->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_completed == NULL)) { kg->kg_upsleeps++; error = msleep(&kg->kg_completed, &p->p_mtx, PPAUSE|PCATCH, "kse_rel", (uap->timeout ? 
tvtohz(&tv) : 0)); kg->kg_upsleeps--; PROC_UNLOCK(p); if (uap->timeout == NULL || error != EWOULDBLOCK) return (0); getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); PROC_LOCK(p); } PROC_UNLOCK(p); return (0); } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { struct proc *p; struct ksegrp *kg; struct kse_upcall *ku; struct thread *td2; p = td->td_proc; td2 = NULL; ku = NULL; /* KSE-enabled processes only, please. */ - if (!(p->p_flag & P_THREADED)) + if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (uap->mbx) { FOREACH_KSEGRP_IN_PROC(p, kg) { FOREACH_UPCALL_IN_GROUP(kg, ku) { if (ku->ku_mailbox == uap->mbx) break; } if (ku) break; } } else { kg = td->td_ksegrp; if (kg->kg_upsleeps) { wakeup_one(&kg->kg_completed); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } ku = TAILQ_FIRST(&kg->kg_upcalls); } if (ku) { if ((td2 = ku->ku_owner) == NULL) { panic("%s: no owner", __func__); } else if (TD_ON_SLEEPQ(td2) && (td2->td_wchan == &kg->kg_completed)) { abortsleep(td2); } else { ku->ku_flags |= KUF_DOUPCALL; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (ESRCH); } /* * No new KSEG: first call: use current KSE, don't schedule an upcall * All other situations, do allocate max new KSEs and schedule an upcall. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { struct kse *newke; struct ksegrp *newkg; struct ksegrp *kg; struct proc *p; struct kse_mailbox mbx; struct kse_upcall *newku; int err, ncpus; p = td->td_proc; if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); /* Too bad, why hasn't kernel always a cpu counter !? */ #ifdef SMP ncpus = mp_ncpus; #else ncpus = 1; #endif if (thread_debug && virtual_cpu != 0) ncpus = virtual_cpu; /* Easier to just set it than to test and set */ PROC_LOCK(p); - p->p_flag |= P_THREADED; + p->p_flag |= P_SA; PROC_UNLOCK(p); kg = td->td_ksegrp; if (uap->newgroup) { /* Have race condition but it is cheap */ if (p->p_numksegrps >= max_groups_per_proc) return (EPROCLIM); /* * If we want a new KSEGRP it doesn't matter whether * we have already fired up KSE mode before or not. * We put the process in KSE mode and create a new KSEGRP. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); mtx_lock_spin(&sched_lock); if (p->p_numksegrps >= max_groups_per_proc) { mtx_unlock_spin(&sched_lock); ksegrp_free(newkg); return (EPROCLIM); } ksegrp_link(newkg, p); mtx_unlock_spin(&sched_lock); } else { newkg = kg; } /* * Creating upcalls more than number of physical cpu does * not help performance. */ if (newkg->kg_numupcalls >= ncpus) return (EPROCLIM); if (newkg->kg_numupcalls == 0) { /* * Initialize KSE group, optimized for MP. * Create KSEs as many as physical cpus, this increases * concurrent even if userland is not MP safe and can only run * on single CPU (for early version of libpthread, it is true). * In ideal world, every physical cpu should execute a thread. * If there is enough KSEs, threads in kernel can be * executed parallel on different cpus with full speed, * Concurrent in kernel shouldn't be restricted by number of * upcalls userland provides. 
* Adding more upcall structures only increases concurrent * in userland. * Highest performance configuration is: * N kses = N upcalls = N phyiscal cpus */ while (newkg->kg_kses < ncpus) { newke = kse_alloc(); bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 mtx_lock_spin(&sched_lock); bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); mtx_unlock_spin(&sched_lock); #endif mtx_lock_spin(&sched_lock); kse_link(newke, newkg); /* Add engine */ kse_reassign(newke); mtx_unlock_spin(&sched_lock); } } newku = upcall_alloc(); newku->ku_mailbox = uap->mbx; newku->ku_func = mbx.km_func; bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t)); /* For the first call this may not have been set */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (newkg->kg_numupcalls >= ncpus) { mtx_unlock_spin(&sched_lock); upcall_free(newku); return (EPROCLIM); } upcall_link(newku, newkg); if (mbx.km_quantum) newkg->kg_upquantum = max(1, mbx.km_quantum/tick); /* * Each upcall structure has an owner thread, find which * one owns it. */ if (uap->newgroup) { /* * Because new ksegrp hasn't thread, * create an initial upcall thread to own it. */ thread_schedule_upcall(td, newku); } else { /* * If current thread hasn't an upcall structure, * just assign the upcall to it. */ if (td->td_upcall == NULL) { newku->ku_owner = td; td->td_upcall = newku; } else { /* * Create a new upcall thread to own it. */ thread_schedule_upcall(td, newku); } } mtx_unlock_spin(&sched_lock); return (0); } /* * Initialize global thread allocation resources. */ void threadinit(void) { thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(), NULL, NULL, ksegrp_init, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sched_sizeof_kse(), NULL, NULL, kse_init, NULL, UMA_ALIGN_CACHE, 0); upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. */ void kse_stash(struct kse *ke) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra upcall into the zombie upcall queue. */ void upcall_stash(struct kse_upcall *ku) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&kse_zombie_lock); } /* * Reap zombie kse resource. */ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; struct kse_upcall *ku_first, *ku_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. 
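 * If anything is found, the queues are snapshotted and reset as a whole
 * under kse_zombie_lock, and the actual freeing (crfree(), thread_free(),
 * kse_free(), ksegrp_free(), upcall_free()) is done after the spin lock
 * has been dropped.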
*/ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps)) || (!TAILQ_EMPTY(&zombie_upcalls))) { mtx_lock_spin(&kse_zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); ku_first = TAILQ_FIRST(&zombie_upcalls); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); if (ku_first) TAILQ_INIT(&zombie_upcalls); mtx_unlock_spin(&kse_zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); ku_first = ku_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error,temp; mcontext_t mc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. */ get_mcontext(td, &mc, 0); addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext); error = copyout(&mc, addr, sizeof(mcontext_t)); if (error) goto bad; /* Exports clock ticks in kernel mode */ addr = (caddr_t)(&td->td_mailbox->tm_sticks); temp = fuword(addr) + td->td_usticks; if (suword(addr, temp)) { error = EFAULT; goto bad; } /* Get address in latest mbox of list pointer */ addr = (void *)(&td->td_mailbox->tm_next); /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { error = EFAULT; goto bad; } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; /* * The thread context may be taken away by * other upcall threads when we unlock * process lock. it's no longer valid to * use it again in any other places. */ td->td_mailbox = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } td->td_usticks = 0; return (0); bad: PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); /* The mailbox is bad, don't use it */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * upcall's mailbox as it's the next one going up. 
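 * This uses the same store-and-recheck pattern as thread_export_context()
 * above: write the current head with suword(), then re-check kg_completed
 * under the proc lock and retry if another thread changed it meanwhile.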
*/ static int thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; addr = (void *)(&ku->ku_mailbox->km_completed); for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * This function should be called at statclock interrupt time */ int thread_statclock(int user) { struct thread *td = curthread; if (td->td_ksegrp->kg_numupcalls == 0) return (-1); if (user) { /* Current always do via ast() */ mtx_lock_spin(&sched_lock); td->td_flags |= (TDF_USTATCLOCK|TDF_ASTPENDING); mtx_unlock_spin(&sched_lock); td->td_uuticks++; } else { if (td->td_mailbox != NULL) td->td_usticks++; else { /* XXXKSE * We will call thread_user_enter() for every * kernel entry in future, so if the thread mailbox * is NULL, it must be a UTS kernel, don't account * clock ticks for it. */ } } return (0); } /* * Export state clock ticks for userland */ static int thread_update_usr_ticks(struct thread *td, int user) { struct proc *p = td->td_proc; struct kse_thr_mailbox *tmbx; struct kse_upcall *ku; struct ksegrp *kg; caddr_t addr; uint uticks; if ((ku = td->td_upcall) == NULL) return (-1); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) return (-1); if (user) { uticks = td->td_uuticks; td->td_uuticks = 0; addr = (caddr_t)&tmbx->tm_uticks; } else { uticks = td->td_usticks; td->td_usticks = 0; addr = (caddr_t)&tmbx->tm_sticks; } if (uticks) { if (suword(addr, uticks+fuword(addr))) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (-2); } } kg = td->td_ksegrp; if (kg->kg_upquantum && ticks >= kg->kg_nextupcall) { mtx_lock_spin(&sched_lock); td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. */ if (p->p_numthreads > 1) { thread_unlink(td); if (p->p_maxthrwaits) wakeup(&p->p_numthreads); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. 
*/ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* * Because each upcall structure has an owner thread, * owner thread exits only when process is in exiting * state, so upcall to userland is no longer needed, * deleting upcall structure is safe here. * So when all threads in a group is exited, all upcalls * in the group should be automatically freed. */ if (td->td_upcall) upcall_remove(td); ke->ke_state = KES_UNQUEUED; ke->ke_thread = NULL; /* * Decide what to do with the KSE attached to this thread. */ if (ke->ke_flags & KEF_EXIT) kse_unlink(ke); else kse_reassign(ke); PROC_UNLOCK(p); td->td_kse = NULL; td->td_state = TDS_INACTIVE; #if 0 td->td_proc = NULL; #endif td->td_ksegrp = NULL; td->td_last_kse = NULL; PCPU_SET(deadthread, td); } else { PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ mtx_assert(&sched_lock, MA_OWNED); #if !defined(__alpha__) && !defined(__powerpc__) cpu_throw(td, choosethread()); #else cpu_throw(); #endif panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant held, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; KASSERT((p->p_numthreads == 1), ("Muliple threads in wait1()")); KASSERT((p->p_numksegrps == 1), ("Muliple ksegrps in wait1()")); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_standin != NULL) { thread_free(td->td_standin); td->td_standin = NULL; } cpu_thread_clean(td); } thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. */ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; td->td_flags = 0; td->td_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; } void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; struct ksegrp *kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* could clear a few other things here */ } /* * Purge a ksegrp resource. When a ksegrp is preparing to * exit, it calls this function. */ static void kse_purge_group(struct thread *td) { struct ksegrp *kg; struct kse *ke; kg = td->td_ksegrp; KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__)); while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); kse_unlink(ke); } KASSERT((kg->kg_kses == 1), ("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); } /* * Purge a process's KSE resource. When a process is preparing to * exit, it calls kse_purge to release any extra KSE resources in * the process. 
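 * Each ksegrp is unlinked and its idle KSEs are stashed for thread_reap();
 * the calling thread's own ksegrp is relinked at the end so the exiting
 * process keeps exactly one ksegrp for its last thread.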
*/ static void kse_purge(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse *ke; KASSERT(p->p_numthreads == 1, ("bad thread number")); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * There is no ownership for KSE, after all threads * in the group exited, it is possible that some KSEs * were left in idle queue, gc them now. */ while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; kse_stash(ke); } KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("ksegrp has wrong kg_kses: %d", kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); if (kg != td->td_ksegrp) ksegrp_stash(kg); } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; } /* * This function is intended to be used to initialize a spare thread * for upcall. Initialize thread's large data area outside sched_lock * for thread_schedule_upcall(). */ void thread_alloc_spare(struct thread *td, struct thread *spare) { if (td->td_standin) return; if (spare == NULL) spare = thread_alloc(); td->td_standin = spare; bzero(&spare->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); spare->td_proc = td->td_proc; spare->td_ucred = crhold(td->td_ucred); } /* * Create a thread and schedule it for upcall on the KSE given. * Use our thread's standin so that we don't have to allocate one. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse_upcall *ku) { struct thread *td2; mtx_assert(&sched_lock, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, * the kse_upcall must be free. * td must have a spare thread. */ KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__)); if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { panic("no reserve thread when scheduling an upcall"); return (NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ku->ku_ksegrp); /* inherit blocked thread's context */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ ku->ku_owner = td2; td2->td_upcall = ku; td2->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (td->td_proc->p_sflag & PS_NEEDSIGCHK) td2->td_flags |= TDF_ASTPENDING; #endif td2->td_kse = NULL; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; setrunqueue(td2); return (td2); /* bogus.. should be a void function */ } void thread_signal_add(struct thread *td, int sig) { struct kse_upcall *ku; struct proc *p; sigset_t ss; int error; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&p->p_sigacts->ps_mtx, MA_OWNED); td = curthread; ku = td->td_upcall; mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); error = copyin(&ku->ku_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); if (error) goto error; SIGADDSET(ss, sig); error = copyout(&ss, &ku->ku_mailbox->km_sigscaught, sizeof(sigset_t)); if (error) goto error; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); return; error: PROC_LOCK(p); sigexit(td, SIGILL); } /* * Schedule an upcall to notify a KSE process recieved signals. 
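 * This just marks the current thread TDF_UPCALLING; the upcall itself is
 * delivered the next time the thread passes through thread_userret().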
* */ void thread_signal_upcall(struct thread *td) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); return; } void thread_switchout(struct thread *td) { struct kse_upcall *ku; mtx_assert(&sched_lock, MA_OWNED); /* * If the outgoing thread is in threaded group and has never * scheduled an upcall, decide whether this is a short * or long term event and thus whether or not to schedule * an upcall. * If it is a short term event, just suspend it in * a way that takes its KSE with it. * Select the events for which we want to schedule upcalls. * For now it's just sleep. * XXXKSE eventually almost any inhibition could do. */ if (TD_CAN_UNBIND(td) && (td->td_standin) && TD_ON_SLEEPQ(td)) { /* * Release ownership of upcall, and schedule an upcall * thread, this new upcall thread becomes the owner of * the upcall structure. */ ku = td->td_upcall; ku->ku_owner = NULL; td->td_upcall = NULL; td->td_flags &= ~TDF_CAN_UNBIND; thread_schedule_upcall(td, ku); } } /* * Setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; kg = td->td_ksegrp; /* * First check that we shouldn't just abort. * But check if we are the single thread first! */ PROC_LOCK(p); if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } PROC_UNLOCK(p); /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in kse_reassign()), * but for now do it every time. */ kg = td->td_ksegrp; if (kg->kg_numupcalls) { ku = td->td_upcall; KASSERT(ku, ("%s: no upcall owned", __func__)); KASSERT((ku->ku_owner == td), ("%s: wrong owner", __func__)); KASSERT(!TD_CAN_UNBIND(td), ("%s: can unbind", __func__)); ku->ku_mflags = fuword((void *)&ku->ku_mailbox->km_flags); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) { td->td_mailbox = NULL; } else { td->td_mailbox = tmbx; if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (ku->ku_mflags & KMF_NOUPCALL) td->td_flags &= ~TDF_CAN_UNBIND; else td->td_flags |= TDF_CAN_UNBIND; mtx_unlock_spin(&sched_lock); } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. */ int thread_userret(struct thread *td, struct trapframe *frame) { int error = 0, upcalls, uts_crit; struct kse_upcall *ku; struct ksegrp *kg, *kg2; struct proc *p; struct timespec ts; p = td->td_proc; kg = td->td_ksegrp; /* Nothing to do with non-threaded group/process */ if (td->td_ksegrp->kg_numupcalls == 0) return (0); /* * Stat clock interrupt hit in userland, it * is returning from interrupt, charge thread's * userland time for UTS. 
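 * (TDF_USTATCLOCK is set by thread_statclock() when a statclock tick lands
 * in user mode; the ticks accumulated in td_uuticks are pushed out to the
 * thread mailbox below via thread_update_usr_ticks().)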
*/ if (td->td_flags & TDF_USTATCLOCK) { thread_update_usr_ticks(td, 1); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_USTATCLOCK; mtx_unlock_spin(&sched_lock); if (kg->kg_completed || (td->td_upcall->ku_flags & KUF_DOUPCALL)) thread_user_enter(p, td); } uts_crit = (td->td_mailbox == NULL); ku = td->td_upcall; /* * Optimisation: * This thread has not started any upcall. * If there is no work to report other than ourself, * then it can return direct to userland. */ if (TD_CAN_UNBIND(td)) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_CAN_UNBIND; if ((td->td_flags & TDF_NEEDSIGCHK) == 0 && (kg->kg_completed == NULL) && (ku->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_upquantum && ticks < kg->kg_nextupcall)) { mtx_unlock_spin(&sched_lock); thread_update_usr_ticks(td, 0); nanotime(&ts); error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday, sizeof(ts)); td->td_mailbox = 0; ku->ku_mflags = 0; if (error) goto out; return (0); } mtx_unlock_spin(&sched_lock); error = thread_export_context(td); if (error) { /* * Failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. */ goto out; } /* * There is something to report, and we own an upcall * strucuture, we can go to userland. * Turn ourself into an upcall thread. */ mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); } else if (td->td_mailbox && (ku == NULL)) { error = thread_export_context(td); /* possibly upcall with error? */ PROC_LOCK(p); /* * There are upcall threads waiting for * work to do, wake one of them up. * XXXKSE Maybe wake all of them up. */ if (!error && kg->kg_upsleeps) wakeup_one(&kg->kg_completed); mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind")); if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { upcalls = 0; FOREACH_KSEGRP_IN_PROC(p, kg2) { if (kg2->kg_numupcalls == 0) upcalls++; else upcalls += kg2->kg_numupcalls; } if (upcalls >= max_threads_per_proc) break; mtx_unlock_spin(&sched_lock); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", NULL)) { mtx_lock_spin(&sched_lock); break; } else { mtx_lock_spin(&sched_lock); } } p->p_maxthrwaits--; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } if (td->td_flags & TDF_UPCALLING) { uts_crit = 0; kg->kg_nextupcall = ticks+kg->kg_upquantum; /* * There is no more work to do and we are going to ride * this thread up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) ku->ku_flags &= ~KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); /* * Set user context to the UTS */ if (!(ku->ku_mflags & KMF_NOUPCALL)) { cpu_set_upcall_kse(td, ku); error = suword(&ku->ku_mailbox->km_curthread, 0); if (error) goto out; } /* * Unhook the list of completed threads. * anything that completes after this gets to * come in next time. * Put the list of completed thread mailboxes on * this KSE's mailbox. 
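 * (That is the thread_link_mboxes() call below; it is skipped when the UTS
 * set KMF_NOCOMPLETED in its mailbox flags.)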
*/ if (!(ku->ku_mflags & KMF_NOCOMPLETED) && (error = thread_link_mboxes(kg, ku)) != 0) goto out; } if (!uts_crit) { nanotime(&ts); error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts)); } out: if (error) { /* * Things are going to be so screwed we should just kill * the process. * how do we do that? */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); PROC_UNLOCK(td->td_proc); } else { /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); } ku->ku_mflags = 0; /* * Clear thread mailbox first, then clear system tick count. * The order is important because thread_statclock() use * mailbox pointer to see if it is an userland thread or * an UTS kernel thread. */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); /* go sync */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accellerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int force_exit) { struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); - if ((p->p_flag & P_THREADED) == 0 && p->p_numthreads == 1) + if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1) return (0); /* Is someone already single threading? */ if (p->p_singlethread) return (1); if (force_exit == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; } else p->p_flag &= ~P_SINGLE_EXIT; p->p_flag |= P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = td; while ((p->p_numthreads - p->p_suspcount) != 1) { FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { if (force_exit == SINGLE_EXIT) { if (TD_IS_SUSPENDED(td2)) { thread_unsuspend_one(td2); } if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } } else { if (TD_IS_SUSPENDED(td2)) continue; /* * maybe other inhibitted states too? * XXXKSE Is it totally safe to * suspend a non-interruptable thread? */ if (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED)) thread_suspend_one(td2); } } } /* * Maybe we suspended some threads.. was it enough? */ if ((p->p_numthreads - p->p_suspcount) == 1) break; /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_one(td); DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock_spin(&sched_lock); } if (force_exit == SINGLE_EXIT) { if (td->td_upcall) upcall_remove(td); kse_purge(p, td); } mtx_unlock_spin(&sched_lock); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). 
* If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); mtx_lock_spin(&sched_lock); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { while (mtx_owned(&Giant)) mtx_unlock(&Giant); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_exit(); else thr_exit1(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. */ thread_suspend_one(td); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. 
*/ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/sched_4bsd.c =================================================================== --- head/sys/kern/sched_4bsd.c (revision 116360) +++ head/sys/kern/sched_4bsd.c (revision 116361) @@ -1,715 +1,715 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ }; static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout schedcpu_callout; static struct callout roundrobin_callout; static void roundrobin(void *arg); static void schedcpu(void *arg); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (p_estcpu) usage in 5 * loadav time * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
* That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * p_estcpu *= decay; * will compute * p_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void *arg) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int realstathz; int awake; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { mtx_lock_spin(&sched_lock); p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment time in/out of memory and sleep * time (if sleeping). We ignore overflow; * with 16-bit int's (remember them?) * overflow takes 45 days. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. 
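 * (The ke_pctcpu update below decays by ccpu once per second; as
 * ccpu =~ exp(-1/20), roughly exp(-3), i.e. 5%, of the old value remains
 * after 60 passes, which is where the "95% in 60 seconds" figure comes from.)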
*/ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * pctcpu is only for ps? * Do it per kse.. and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? ((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. */ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else { kg->kg_slptime++; } if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max p_estcpu of 255, sleeping for at * least six times the loadfactor will decay p_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register unsigned int newcpu; register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); newcpu = kg->kg_estcpu; if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { kg->kg_slptime--; /* the first time was done in schedcpu */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&schedcpu_callout, 1); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* External interfaces start here */ int sched_runnable(void) { return runq_check(&runq); } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. 
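A small sketch of the resetpriority() arithmetic for PRI_TIMESHARE ksegrps. The priority constants used here (PUSER = PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223, PRIO_MIN = -20) are assumed values from <sys/priority.h> and <sys/resource.h>, quoted only to make the numbers concrete.

#include <stdio.h>

#define PUSER                 160   /* assumed, equals PRI_MIN_TIMESHARE */
#define PRI_MIN_TIMESHARE     160
#define PRI_MAX_TIMESHARE     223
#define PRIO_MIN              (-20)
#define INVERSE_ESTCPU_WEIGHT 8
#define NICE_WEIGHT           1

static int user_pri(int estcpu, int nice)
{
    int pri;

    pri = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
        NICE_WEIGHT * (nice - PRIO_MIN);
    if (pri < PRI_MIN_TIMESHARE)        /* clamp into the timeshare range */
        pri = PRI_MIN_TIMESHARE;
    if (pri > PRI_MAX_TIMESHARE)
        pri = PRI_MAX_TIMESHARE;
    return (pri);
}

int main(void)
{
    /* A nice 0 process loses one priority level per 8 estcpu ticks. */
    printf("estcpu 0,  nice  0 -> %d\n", user_pri(0, 0));    /* 180 */
    printf("estcpu 80, nice  0 -> %d\n", user_pri(80, 0));   /* 190 */
    printf("estcpu 0,  nice 20 -> %d\n", user_pri(0, 20));   /* 200 */
    return (0);
}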
The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (p_estcpu) is increased here. resetpriority() will * compute a different priority each time p_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct kse *ke) { struct ksegrp *kg; struct thread *td; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; td = ke->ke_thread; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * charge childs scheduling cpu usage to parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait since process estcpu is sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_exit_kse(struct kse *ke, struct kse *child) { } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_nice(struct ksegrp *kg, int nice) { PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); kg->kg_nice = nice; resetpriority(kg); } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. 
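A sketch of the charging cadence described above: the 4BSD sched_clock() bumps kg_estcpu once per statclock tick and only calls resetpriority() when the count crosses a multiple of INVERSE_ESTCPU_WEIGHT, so the user priority degrades by one step per eight ticks of CPU use. The estcpu ceiling below is an assumed stand-in for the ESTCPULIM() value.

#include <stdio.h>

#define INVERSE_ESTCPU_WEIGHT 8
#define ESTCPU_MAX            295     /* assumed ESTCPULIM() ceiling */

int main(void)
{
    unsigned int estcpu = 0;
    int tick, recomputes = 0;

    for (tick = 1; tick <= 128; tick++) {
        estcpu++;                          /* charge one statclock tick */
        if (estcpu > ESTCPU_MAX)           /* ESTCPULIM() clamp */
            estcpu = ESTCPU_MAX;
        if ((estcpu % INVERSE_ESTCPU_WEIGHT) == 0)
            recomputes++;                  /* resetpriority() would run */
    }
    printf("estcpu %u after 128 ticks, %d recomputations\n",
        estcpu, recomputes);               /* 128 and 16 */
    return (0);
}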
*/ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_priority = prio; } void sched_switchin(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); td->td_oncpu = PCPU_GET(cpuid); } void sched_switchout(struct thread *td) { struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?")); td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); - } else if (p->p_flag & P_THREADED) { + } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. */ kse_reassign(ke); } } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(&runq, ke); } void sched_rem(struct kse *ke) { KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); runq_remove(&runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; ke = runq_choose(&runq); if (ke != NULL) { runq_remove(&runq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("runq_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. 
*/ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct kse *ke) { return (ke->ke_pctcpu); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 116360) +++ head/sys/kern/sched_ule.c (revision 116361) @@ -1,1280 +1,1280 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); static int sched_strict; SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 2; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; #ifdef SMP /* Callout to handle load balancing SMP systems. */ static struct callout kseq_lb_callout; #endif /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. 
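One plausible way to realize the layout implied by the sched_sizeof_*() functions and the comment above: the scheduler-private area is carved out of the same allocation, immediately after the generic structure. The allocation call and field names here are illustrative, not the kernel's zone code.

#include <stdio.h>
#include <stdlib.h>

struct ke_sched { int ske_cpticks; };

struct kse {
    int              ke_state;        /* generic, scheduler-visible fields */
    struct ke_sched *ke_sched;        /* points into the same allocation */
};

static size_t sched_sizeof_kse(void)
{
    return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int main(void)
{
    struct kse *ke = calloc(1, sched_sizeof_kse());

    if (ke == NULL)
        return (1);
    ke->ke_sched = (struct ke_sched *)(ke + 1);   /* private area follows */
    ke->ke_sched->ske_cpticks = 0;
    printf("kse %zu bytes + ke_sched %zu bytes in one object\n",
        sizeof(struct kse), sizeof(struct ke_sched));
    free(ke);
    return (0);
}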
*/ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu struct kg_sched { int skg_slptime; /* Number of ticks we vol. slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * This priority range has 20 priorities on either end that are reachable * only through nice values. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Reserved priorities for nice. * PRI_BASE: The start of the dynamic range. * DYN_RANGE: Number of priorities that are available int the dynamic * priority range. */ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV PRIO_TOTAL #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_PRI_BASE ((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE) #define SCHED_DYN_RANGE (SCHED_PRI_RANGE - SCHED_PRI_NRESV) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_THROTTLE: Divisor for reducing slp/run time. * INTERACT_RANGE: Range of interactivity values. Smaller is better. * INTERACT_HALF: Convenience define, half of the interactivity range. * INTERACT_THRESH: Threshhold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz / 10) << 10) #define SCHED_SLP_RUN_THROTTLE (10) #define SCHED_INTERACT_RANGE (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_RANGE / 2) #define SCHED_INTERACT_THRESH (10) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) /* * This macro determines whether or not the kse belongs on the current or * next run queue. * * XXX nice value should effect how interactive a kg is. 
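A sketch of how the macros above translate an interactivity score and a nice value into a timeshare priority, mirroring what sched_priority() does further down. PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223 and PRIO_TOTAL = 40 are assumed values used only to make the arithmetic concrete.

#include <stdio.h>

#define PRI_MIN_TIMESHARE  160
#define PRI_MAX_TIMESHARE  223
#define PRIO_TOTAL         40

#define SCHED_PRI_RANGE      (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define SCHED_PRI_NRESV      PRIO_TOTAL
#define SCHED_PRI_BASE       ((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define SCHED_DYN_RANGE      (SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define SCHED_INTERACT_RANGE 100
#define SCHED_PRI_INTERACT(score) \
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)

static int ule_user_pri(int score, int nice)
{
    int pri = SCHED_PRI_INTERACT(score) + SCHED_PRI_BASE + nice;

    if (pri > PRI_MAX_TIMESHARE)
        pri = PRI_MAX_TIMESHARE;
    else if (pri < PRI_MIN_TIMESHARE)
        pri = PRI_MIN_TIMESHARE;
    return (pri);
}

int main(void)
{
    printf("score   0, nice   0 -> %d\n", ule_user_pri(0, 0));    /* 180 */
    printf("score 100, nice   0 -> %d\n", ule_user_pri(100, 0));  /* 204 */
    printf("score   0, nice -20 -> %d\n", ule_user_pri(0, -20));  /* 160 */
    return (0);
}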
*/ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ int ksq_load; /* Aggregate load. */ short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP unsigned int ksq_rslices; /* Slices on run queue */ #endif }; /* * One kse queue per processor. */ #ifdef SMP struct kseq kseq_cpu[MAXCPU]; #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (&kseq_cpu[(x)]) #else struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); int sched_pickcpu(void); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq); static void kseq_setup(struct kseq *kseq); static void kseq_add(struct kseq *kseq, struct kse *ke); static void kseq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP struct kseq * kseq_load_highest(void); void kseq_balance(void *arg); void kseq_move(struct kseq *from, int cpu); #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < PRIO_TOTAL + 1; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static void kseq_add(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; kseq->ksq_load++; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices += ke->ke_slice; #endif } static void kseq_rem(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices -= ke->ke_slice; #endif } static void kseq_nice_add(struct kseq *kseq, int nice) 
{ mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_loads[PRI_TIMESHARE] == 0) return; for (; n < SCHED_PRI_NRESV + 1; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * kseq_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ void kseq_balance(void *arg) { struct kseq *kseq; int high_load; int low_load; int high_cpu; int low_cpu; int move; int diff; int i; high_cpu = 0; low_cpu = 0; high_load = 0; low_load = -1; mtx_lock_spin(&sched_lock); for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > high_load) { high_load = kseq->ksq_load; high_cpu = i; } if (low_load == -1 || kseq->ksq_load < low_load) { low_load = kseq->ksq_load; low_cpu = i; } } /* * Nothing to do. */ if (high_load < 2 || low_load == high_load) goto out; diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; for (i = 0; i < move; i++) kseq_move(KSEQ_CPU(high_cpu), low_cpu); out: mtx_unlock_spin(&sched_lock); callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); return; } struct kseq * kseq_load_highest(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); cpu = 0; load = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > load) { load = kseq->ksq_load; cpu = i; } } if (load > 1) return (KSEQ_CPU(cpu)); return (NULL); } void kseq_move(struct kseq *from, int cpu) { struct kse *ke; ke = kseq_choose(from); runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; kseq_rem(from, ke); ke->ke_cpu = cpu; sched_add(ke); } #endif struct kse * kseq_choose(struct kseq *kseq) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. 
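A small sketch of the migration count that kseq_balance() settles on for one pass between the most and least loaded CPUs: half the load difference, rounded up, and nothing at all when the busiest queue holds fewer than two KSEs or the loads already match.

#include <stdio.h>

static int balance_moves(int high_load, int low_load)
{
    int diff, move;

    if (high_load < 2 || low_load == high_load)
        return (0);               /* nothing to do */
    diff = high_load - low_load;
    move = diff / 2;
    if (diff & 0x1)               /* round odd differences up */
        move++;
    return (move);
}

int main(void)
{
    printf("loads 7/2 -> move %d\n", balance_moves(7, 2));   /* 3 */
    printf("loads 5/4 -> move %d\n", balance_moves(5, 4));   /* 1 */
    printf("loads 1/0 -> move %d\n", balance_moves(1, 0));   /* 0 */
    return (0);
}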
*/ if (ke->ke_slice == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_loads[PRI_ITHD] = 0; kseq->ksq_loads[PRI_REALTIME] = 0; kseq->ksq_loads[PRI_TIMESHARE] = 0; kseq->ksq_loads[PRI_IDLE] = 0; kseq->ksq_load = 0; #ifdef SMP kseq->ksq_rslices = 0; #endif } static void sched_setup(void *dummy) { int i; slice_min = (hz/100); slice_max = (hz/10); mtx_lock_spin(&sched_lock); /* init kseqs */ for (i = 0; i < MAXCPU; i++) kseq_setup(KSEQ_CPU(i)); kseq_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); #ifdef SMP callout_init(&kseq_lb_callout, 1); kseq_balance(NULL); #endif } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. * * If you are outside of the window you will get no slice and * you will be reevaluated each time you are selected on the * run queue. * */ if (!SCHED_INTERACTIVE(kg)) { int nice; nice = kg->kg_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || kg->kg_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_PRI_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_MIN; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); /* * Check to see if we need to scale back the slp and run time * in the kg. This will cause us to forget old interactivity * while maintaining the current ratio. 
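A sketch of the slice assignment described in the rationale above, with hz assumed to be 100 so that slice_min is 1 tick and slice_max is 10, the values sched_setup() derives. nice_offset stands for the distance from ksq_nicemin, the least nice ksegrp on the queue.

#include <stdio.h>

#define HZ                100
#define SCHED_SLICE_MIN   (HZ / 100)
#define SCHED_SLICE_MAX   (HZ / 10)
#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define SCHED_PRI_NTHRESH 19
#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
#define SCHED_SLICE_NICE(nice) \
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

static int slice_for(int interactive, int nice_offset)
{
    if (interactive)
        return (SCHED_SLICE_MIN);           /* re-evaluated quickly */
    if (nice_offset <= 0)
        return (SCHED_SLICE_MAX);           /* least nice on this queue */
    if (nice_offset <= SCHED_PRI_NTHRESH)
        return (SCHED_SLICE_NICE(nice_offset));
    return (0);                             /* outside the 20 point window */
}

int main(void)
{
    int n;

    for (n = 0; n <= 20; n += 5)
        printf("nice offset %2d -> slice %2d ticks\n", n, slice_for(0, n));
    printf("interactive     -> slice %2d ticks\n", slice_for(1, 0));
    return (0);
}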
*/ CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)", ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10, sched_interact_score(kg)); if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE; kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE; } CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)", ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10, sched_interact_score(kg)); return; } static int sched_interact_score(struct ksegrp *kg) { int big; int small; int base; if (kg->kg_runtime > kg->kg_slptime) { big = kg->kg_runtime; small = kg->kg_slptime; base = SCHED_INTERACT_HALF; } else { big = kg->kg_slptime; small = kg->kg_runtime; base = 0; } big /= SCHED_INTERACT_HALF; if (big != 0) small /= big; else small = 0; small += base; /* XXX Factor in nice */ return (small); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. * * Shift the tick count out so that the divide doesn't round away * our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP /* XXX Should be changed to kseq_load_lowest() */ int sched_pickcpu(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); if (!smp_started) return (0); load = 0; cpu = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load < load) { cpu = i; load = kseq->ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { struct kse *ke; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_priority = prio; if (TD_ON_RUNQ(td)) { rq = ke->ke_runq; runq_remove(rq, ke); runq_add(rq, ke); } } void sched_switchout(struct thread *td) { struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { runq_add(ke->ke_runq, ke); /* setrunqueue(td); */ return; } if (ke->ke_runq) kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); /* * We will not be on the run queue. So we must be * sleeping or similar. */ - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) kse_reassign(ke); } void sched_switchin(struct thread *td) { /* struct kse *ke = td->td_kse; */ mtx_assert(&sched_lock, MA_OWNED); td->td_oncpu = PCPU_GET(cpuid); if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && td->td_priority != td->td_ksegrp->kg_user_pri) curthread->td_flags |= TDF_NEEDRESCHED; } void sched_nice(struct ksegrp *kg, int nice) { struct kse *ke; struct thread *td; struct kseq *kseq; PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. 
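The interactivity calculation in sched_interact_score() boils down to roughly 50 * min(slptime, runtime) / max(slptime, runtime), offset into the upper half of the range when run time dominates. This user-space sketch reproduces it with illustrative inputs; scores below SCHED_INTERACT_THRESH count as interactive.

#include <stdio.h>

#define SCHED_INTERACT_RANGE  100
#define SCHED_INTERACT_HALF   (SCHED_INTERACT_RANGE / 2)
#define SCHED_INTERACT_THRESH 10

static int interact_score(int runtime, int slptime)
{
    int big, small, base;

    if (runtime > slptime) {
        big = runtime;
        small = slptime;
        base = SCHED_INTERACT_HALF;   /* CPU bound: upper half of range */
    } else {
        big = slptime;
        small = runtime;
        base = 0;                     /* sleeps more than it runs */
    }
    big /= SCHED_INTERACT_HALF;
    if (big != 0)
        small /= big;
    else
        small = 0;
    return (small + base);
}

int main(void)
{
    printf("run 100, sleep 900 -> %2d (interactive)\n",
        interact_score(100, 900));                            /* 5 */
    printf("run 500, sleep 500 -> %2d\n", interact_score(500, 500)); /* 50 */
    printf("run 900, sleep 100 -> %2d\n", interact_score(900, 100)); /* 55 */
    return (0);
}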
*/ if (kg->kg_pri_class == PRI_TIMESHARE) FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq_nice_rem(kseq, kg->kg_nice); kseq_nice_add(kseq, nice); } kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = ticks - td->td_slptime; kg->kg_slptime += hzticks << 10; sched_priority(kg); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = ke->ke_slice; child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ child->ke_runq = NULL; /* * Claim that we've been running for one second for statistical * purposes. */ child->ke_ticks = 0; child->ke_ltick = ticks; child->ke_ftick = ticks - hz; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); /* XXX Need something better here */ if (kg->kg_slptime > kg->kg_runtime) { child->kg_slptime = SCHED_DYN_RANGE; child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE; } else { child->kg_runtime = SCHED_DYN_RANGE; child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE; } child->kg_user_pri = kg->kg_user_pri; child->kg_nice = kg->kg_nice; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; kseq->ksq_loads[PRI_BASE(class)]++; if (kg->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, kg->kg_nice); else if (class == PRI_TIMESHARE) kseq_nice_add(kseq, kg->kg_nice); } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; struct thread *td; #if 0 struct kse *nke; #endif /* * sched_setup() apparently happens prior to stathz being set. 
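A sketch of the bookkeeping done by sched_wakeup() together with the SLP_RUN_MAX throttle from sched_slice(): sleep ticks are scaled by 1024 (shifted left by 10) so they share units with kg_runtime, and once the combined history exceeds SCHED_SLP_RUN_MAX both are divided by SCHED_SLP_RUN_THROTTLE, forgetting old behaviour while preserving the ratio. hz and the tick values below are assumptions for illustration.

#include <stdio.h>

#define HZ                     100
#define SCHED_SLP_RUN_MAX      ((HZ / 10) << 10)
#define SCHED_SLP_RUN_THROTTLE 10

int main(void)
{
    int ticks = 5000;          /* current tick counter (illustrative) */
    int td_slptime = 4980;     /* tick value recorded by sched_sleep() */
    int kg_slptime = 0, kg_runtime = 4 << 10;
    int hzticks;

    hzticks = ticks - td_slptime;       /* slept for 20 ticks */
    kg_slptime += hzticks << 10;        /* same scaled units as runtime */

    /* sched_slice() later forgets old history but keeps the ratio. */
    if (kg_slptime + kg_runtime > SCHED_SLP_RUN_MAX) {
        kg_slptime /= SCHED_SLP_RUN_THROTTLE;
        kg_runtime /= SCHED_SLP_RUN_THROTTLE;
    }
    printf("slptime %d, runtime %d (scaled by 1024)\n",
        kg_slptime, kg_runtime);
    return (0);
}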
We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. */ if (tickincr == 0) tickincr = 1; } td = ke->ke_thread; kg = ke->ke_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * Check for a higher priority task on the run queue. This can happen * on SMP if another processor woke up a process on our runq. */ kseq = KSEQ_SELF(); #if 0 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) { if (sched_strict && nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; else if (nke->ke_thread->td_priority < td->td_priority SCHED_PRIO_SLOP) if (nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; } #endif /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. */ kg->kg_runtime += tickincr << 10; /* * We used up one time slice. */ ke->ke_slice--; #ifdef SMP kseq->ksq_rslices--; #endif if (ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; mtx_lock_spin(&sched_lock); kseq = KSEQ_SELF(); if (kseq->ksq_load) goto out; #ifdef SMP /* * For SMP we may steal other processor's KSEs. Just search until we * verify that at least on other cpu has a runnable task. */ if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > 1) goto out; } } #endif load = 0; out: mtx_unlock_spin(&sched_lock); return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP retry: #endif kseq = KSEQ_SELF(); ke = kseq_choose(kseq); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (smp_started) { /* * Find the cpu with the highest load and steal one proc. */ if ((kseq = kseq_load_highest()) == NULL) return (NULL); /* * Remove this kse from this kseq and runq and then requeue * on the current processor. Then we will dequeue it * normally above. 
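A sketch of the tickincr setup done lazily in ULE's sched_clock(): statclock ticks are converted into hz-tick units before being charged to kg_runtime, and the XXX case where stathz exceeds hz collapses to an increment of one. The hz/stathz pairs below are illustrative.

#include <stdio.h>

static int tick_increment(int hz, int stathz)
{
    int realstathz = stathz ? stathz : hz;
    int tickincr = hz / realstathz;

    if (tickincr == 0)      /* stathz > hz collapses to 1, as noted above */
        tickincr = 1;
    return (tickincr);
}

int main(void)
{
    printf("hz 1000, stathz 128 -> tickincr %d\n", tick_increment(1000, 128));
    printf("hz  100, stathz 128 -> tickincr %d\n", tick_increment(100, 128));
    printf("hz  100, stathz   0 -> tickincr %d\n", tick_increment(100, 0));
    return (0);
}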
*/ kseq_move(kseq, PCPU_GET(cpuid)); goto retry; } #endif return (NULL); } void sched_add(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); kg = ke->ke_ksegrp; switch (PRI_BASE(kg->kg_pri_class)) { case PRI_ITHD: case PRI_REALTIME: kseq = KSEQ_SELF(); ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: kseq = KSEQ_CPU(ke->ke_cpu); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: kseq = KSEQ_CPU(ke->ke_cpu); /* * This is for priority prop. */ if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class.\n"); break; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); kseq_add(kseq, ke); } void sched_rem(struct kse *ke) { struct kseq *kseq; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); runq_remove(ke->ke_runq, ke); kseq_rem(kseq, ke); } fixpt_t sched_pctcpu(struct kse *ke) { fixpt_t pctcpu; pctcpu = 0; mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* Update to account for time potentially spent sleeping */ ke->ke_ltick = ticks; sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = ke->ke_ticks / SCHED_CPU_TIME; pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 116360) +++ head/sys/kern/subr_trap.c (revision 116361) @@ -1,272 +1,272 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #ifdef __i386__ #include "opt_npx.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Define the code needed before returning to user mode, for * trap and syscall. * * MPSAFE */ void userret(td, frame, oticks) struct thread *td; struct trapframe *frame; u_int oticks; { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); #ifdef INVARIANTS /* Check that we called signotify() enough. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); #endif /* * Let the scheduler adjust our priority etc. */ sched_userret(td); /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. * Don't bother doing all the work if the stop bits are not set * at this time.. If we miss it, we miss it.. no big deal. */ if (P_SHOULDSTOP(p)) { PROC_LOCK(p); thread_suspend_check(0); /* Can suspend or kill */ PROC_UNLOCK(p); } /* * Do special thread processing, e.g. upcall tweaking and such. */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { thread_userret(td, frame); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { quad_t ticks; mtx_lock_spin(&sched_lock); ticks = td->td_sticks - oticks; mtx_unlock_spin(&sched_lock); addupc_task(td, TRAPF_PC(frame), (u_int)ticks * psratio); } } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. 
*/ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; struct rlimit *rlim; u_int prticks, sticks; int sflag; int flags; int sig; #if defined(DEV_NPX) && !defined(SMP) int ucode; #endif td = curthread; p = td->td_proc; kg = td->td_ksegrp; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); mtx_assert(&sched_lock, MA_NOTOWNED); td->td_frame = framep; /* * This updates the p_sflag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ mtx_lock_spin(&sched_lock); ke = td->td_kse; sticks = td->td_sticks; flags = td->td_flags; sflag = p->p_sflag; p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU); #ifdef MAC p->p_sflag &= ~PS_MACPEND; #endif td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_OWEUPC); cnt.v_soft++; prticks = 0; if (flags & TDF_OWEUPC && p->p_flag & P_PROFIL) { prticks = p->p_stats->p_prof.pr_ticks; p->p_stats->p_prof.pr_ticks = 0; } mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling * tick is stored per KSE in this code, the statistics * themselves are still stored per process. * This should probably change, by which I mean that * possibly the location of both might change. */ if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (flags & TDF_OWEUPC && p->p_flag & P_PROFIL) addupc_task(td, p->p_stats->p_prof.pr_addr, prticks); if (sflag & PS_ALRMPEND) { PROC_LOCK(p); psignal(p, SIGVTALRM); PROC_UNLOCK(p); } #if defined(DEV_NPX) && !defined(SMP) if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, PCB_NPXTRAP); ucode = npxtrap(); if (ucode != -1) { trapsignal(td, SIGFPE, ucode); } } #endif if (sflag & PS_PROFPEND) { PROC_LOCK(p); psignal(p, SIGPROF); PROC_UNLOCK(p); } if (sflag & PS_XCPU) { PROC_LOCK(p); rlim = &p->p_rlimit[RLIMIT_CPU]; mtx_lock_spin(&sched_lock); if (p->p_runtime.sec >= rlim->rlim_max) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim->rlim_max) p->p_cpulimit += 5; mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); } PROC_UNLOCK(p); } #ifdef MAC if (sflag & PS_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { mtx_lock_spin(&sched_lock); sched_prio(td, kg->kg_user_pri); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } if (flags & TDF_NEEDSIGCHK) { int sigs; sigs = 0; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { postsig(sig); sigs++; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); - if (p->p_flag & P_THREADED && sigs) { + if (p->p_flag & P_SA && sigs) { struct kse_upcall *ku = td->td_upcall; if ((void *)TRAPF_PC(framep) != ku->ku_func) { mtx_lock_spin(&sched_lock); ku->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } } } userret(td, framep, sticks); #ifdef DIAGNOSTIC cred_free_thread(td); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/tty.c =================================================================== --- head/sys/kern/tty.c (revision 116360) +++ head/sys/kern/tty.c (revision 116361) @@ -1,2723 +1,2723 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University 
of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2002 Networks Associates Technologies, Inc. * All rights reserved. * * Portions of this software were developed for the FreeBSD Project by * ThinkSec AS and NAI Labs, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty.c 8.8 (Berkeley) 1/21/94 */ /*- * TODO: * o Fix races for sending the start char in ttyflush(). * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). * With luck, there will be MIN chars before select() returns(). * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. * o Don't allow input in TS_ZOMBIE case. It would be visible through * FIONREAD. * o Do the new sio locking stuff here and use it to avoid special * case for EXTPROC? * o Lock PENDIN too? * o Move EXTPROC and/or PENDIN to t_state? * o Wrap most of ttioctl in spltty/splx. * o Implement TIOCNOTTY or remove it from . * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. * o Don't allow certain termios flags to affect disciplines other * than TTYDISC. Cancel their effects before switch disciplines * and ignore them if they are set while we are in another * discipline. * o Now that historical speed conversions are handled here, don't * do them in drivers. * o Check for TS_CARR_ON being set while everything is closed and not * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, * so it would live until the next open even if carrier drops. 
* o Restore TS_WOPEN since it is useful in pstat. It must be cleared * only when _all_ openers leave open(). */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_tty.h" #include #include #include #include #include #include #include #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #include #endif #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); long tk_cancc; long tk_nin; long tk_nout; long tk_rawcc; static int proc_compare(struct proc *p1, struct proc *p2); static int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); static int ttyoutput(int c, struct tty *tp); static void ttypend(struct tty *tp); static void ttyretype(struct tty *tp); static void ttyrub(int c, struct tty *tp); static void ttyrubo(struct tty *tp, int cnt); static void ttyunblock(struct tty *tp); static int ttywflush(struct tty *tp); static int filt_ttyread(struct knote *kn, long hint); static void filt_ttyrdetach(struct knote *kn); static int filt_ttywrite(struct knote *kn, long hint); static void filt_ttywdetach(struct knote *kn); /* * Table with character classes and parity. The 8th bit indicates parity, * the 7th bit indicates the character is an alphameric or underscore (for * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits * are 0 then the character needs no special processing on output; classes * other than 0 might be translated or (not currently) require delays. */ #define E 0x00 /* Even parity. */ #define O 0x80 /* Odd parity. */ #define PARITY(c) (char_type[c] & O) #define ALPHA 0x40 /* Alpha or underscore. */ #define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) #define CCLASSMASK 0x3f #define CCLASS(c) (char_type[c] & CCLASSMASK) #define BS BACKSPACE #define CC CONTROL #define CR RETURN #define NA ORDINARY | ALPHA #define NL NEWLINE #define NO ORDINARY #define TB TAB #define VT VTAB static u_char const char_type[] = { E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ /* * Meta chars; should be settable per character set; * for now, treat them all as normal characters. 
*/ NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, }; #undef BS #undef CC #undef CR #undef NA #undef NL #undef NO #undef TB #undef VT /* Macros to clear/set/test flags. */ #define SET(t, f) (t) |= (f) #define CLR(t, f) (t) &= ~(f) #define ISSET(t, f) ((t) & (f)) #undef MAX_INPUT /* XXX wrong in */ #define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ /* * list of struct tty where pstat(8) can pick it up with sysctl */ static SLIST_HEAD(, tty) tty_list; static int drainwait = 5*60; SYSCTL_INT(_kern, OID_AUTO, drainwait, CTLFLAG_RW, &drainwait, 0, "Output drain timeout in seconds"); /* * Initial open of tty, or (re)entry to standard tty line discipline. */ int ttyopen(dev_t device, struct tty *tp) { int s; s = spltty(); tp->t_dev = device; if (!ISSET(tp->t_state, TS_ISOPEN)) { SET(tp->t_state, TS_ISOPEN); if (ISSET(tp->t_cflag, CLOCAL)) SET(tp->t_state, TS_CONNECTED); bzero(&tp->t_winsize, sizeof(tp->t_winsize)); } /* XXX don't hang forever on output */ if (tp->t_timeout < 0) tp->t_timeout = drainwait*hz; ttsetwater(tp); splx(s); return (0); } /* * Handle close() on a tty line: flush and set to initial state, * bumping generation number so that pending read/write calls * can detect recycling of the tty. * XXX our caller should have done `spltty(); l_close(); ttyclose();' * and l_close() should have flushed, but we repeat the spltty() and * the flush in case there are buggy callers. */ int ttyclose(struct tty *tp) { int s; funsetown(&tp->t_sigio); s = spltty(); if (constty == tp) constty = NULL; ttyflush(tp, FREAD | FWRITE); clist_free_cblocks(&tp->t_canq); clist_free_cblocks(&tp->t_outq); clist_free_cblocks(&tp->t_rawq); tp->t_gen++; tp->t_line = TTYDISC; tp->t_pgrp = NULL; tp->t_session = NULL; tp->t_state = 0; splx(s); return (0); } #define FLUSHQ(q) { \ if ((q)->c_cc) \ ndflush(q, (q)->c_cc); \ } /* Is 'c' a line delimiter ("break" character)? */ #define TTBREAKC(c, lflag) \ ((c) == '\n' || (((c) == cc[VEOF] || \ (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ (c) != _POSIX_VDISABLE)) /* * Process input of a single character received on a tty. */ int ttyinput(int c, struct tty *tp) { tcflag_t iflag, lflag; cc_t *cc; int i, err; /* * If input is pending take it first. */ lflag = tp->t_lflag; if (ISSET(lflag, PENDIN)) ttypend(tp); /* * Gather stats. */ if (ISSET(lflag, ICANON)) { ++tk_cancc; ++tp->t_cancc; } else { ++tk_rawcc; ++tp->t_rawcc; } ++tk_nin; /* * Block further input iff: * current input > threshold AND input is available to user program * AND input flow control is enabled and not yet invoked. * The 3 is slop for PARMRK. */ iflag = tp->t_iflag; if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && !ISSET(tp->t_state, TS_TBLOCK)) ttyblock(tp); /* Handle exceptional conditions (break, parity, framing). 
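A user-space sketch of the TTBREAKC() test defined above: a newline always delimits a canonical line, while VEOF/VEOL/VEOL2 only do so when actually configured (not _POSIX_VDISABLE). The cc[] contents and the IEXTEN flag value here are illustrative defaults, not the kernel's definitions.

#include <stdio.h>

#define _POSIX_VDISABLE 0xff
#define IEXTEN          0x1           /* illustrative flag value */

enum { VEOF, VEOL, VEOL2, NCCS };

static int is_break(unsigned char c, unsigned char *cc, int lflag)
{
    return (c == '\n' ||
        ((c == cc[VEOF] || c == cc[VEOL] ||
        (c == cc[VEOL2] && (lflag & IEXTEN))) && c != _POSIX_VDISABLE));
}

int main(void)
{
    unsigned char cc[NCCS];

    cc[VEOF] = 0x04;                   /* ^D, the usual default */
    cc[VEOL] = _POSIX_VDISABLE;        /* unset */
    cc[VEOL2] = _POSIX_VDISABLE;

    printf("'\\n' -> %d\n", is_break('\n', cc, IEXTEN));   /* 1 */
    printf("^D   -> %d\n", is_break(0x04, cc, IEXTEN));    /* 1 */
    printf("'a'  -> %d\n", is_break('a', cc, IEXTEN));     /* 0 */
    return (0);
}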
*/ cc = tp->t_cc; err = (ISSET(c, TTY_ERRORMASK)); if (err) { CLR(c, TTY_ERRORMASK); if (ISSET(err, TTY_BI)) { if (ISSET(iflag, IGNBRK)) return (0); if (ISSET(iflag, BRKINT)) { ttyflush(tp, FREAD | FWRITE); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGINT, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } if (ISSET(iflag, PARMRK)) goto parmrk; } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) || ISSET(err, TTY_FE)) { if (ISSET(iflag, IGNPAR)) return (0); else if (ISSET(iflag, PARMRK)) { parmrk: if (tp->t_rawq.c_cc + tp->t_canq.c_cc > MAX_INPUT - 3) goto input_overflow; (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0 | TTY_QUOTE, &tp->t_rawq); (void)putc(c | TTY_QUOTE, &tp->t_rawq); goto endcase; } else c = 0; } } if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) CLR(c, 0x80); if (!ISSET(lflag, EXTPROC)) { /* * Check for literal nexting very first */ if (ISSET(tp->t_state, TS_LNCH)) { SET(c, TTY_QUOTE); CLR(tp->t_state, TS_LNCH); } /* * Scan for special characters. This code * is really just a big case statement with * non-constant cases. The bottom of the * case statement is labeled ``endcase'', so goto * it after a case match, or similar. */ /* * Control chars which aren't controlled * by ICANON, ISIG, or IXON. */ if (ISSET(lflag, IEXTEN)) { if (CCEQ(cc[VLNEXT], c)) { if (ISSET(lflag, ECHO)) { if (ISSET(lflag, ECHOE)) { (void)ttyoutput('^', tp); (void)ttyoutput('\b', tp); } else ttyecho(c, tp); } SET(tp->t_state, TS_LNCH); goto endcase; } if (CCEQ(cc[VDISCARD], c)) { if (ISSET(lflag, FLUSHO)) CLR(tp->t_lflag, FLUSHO); else { ttyflush(tp, FWRITE); ttyecho(c, tp); if (tp->t_rawq.c_cc + tp->t_canq.c_cc) ttyretype(tp); SET(tp->t_lflag, FLUSHO); } goto startoutput; } } /* * Signals. */ if (ISSET(lflag, ISIG)) { if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD | FWRITE); ttyecho(c, tp); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } if (CCEQ(cc[VSUSP], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD); ttyecho(c, tp); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGTSTP, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } } /* * Handle start/stop characters. */ if (ISSET(iflag, IXON)) { if (CCEQ(cc[VSTOP], c)) { if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); return (0); } if (!CCEQ(cc[VSTART], c)) return (0); /* * if VSTART == VSTOP then toggle */ goto endcase; } if (CCEQ(cc[VSTART], c)) goto restartoutput; } /* * IGNCR, ICRNL, & INLCR */ if (c == '\r') { if (ISSET(iflag, IGNCR)) return (0); else if (ISSET(iflag, ICRNL)) c = '\n'; } else if (c == '\n' && ISSET(iflag, INLCR)) c = '\r'; } if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { /* * From here on down canonical mode character * processing takes place. */ /* * erase or erase2 (^H / ^?) 
*/ if (CCEQ(cc[VERASE], c) || CCEQ(cc[VERASE2], c) ) { if (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); goto endcase; } /* * kill (^U) */ if (CCEQ(cc[VKILL], c)) { if (ISSET(lflag, ECHOKE) && tp->t_rawq.c_cc == tp->t_rocount && !ISSET(lflag, ECHOPRT)) while (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); else { ttyecho(c, tp); if (ISSET(lflag, ECHOK) || ISSET(lflag, ECHOKE)) ttyecho('\n', tp); FLUSHQ(&tp->t_rawq); tp->t_rocount = 0; } CLR(tp->t_state, TS_LOCAL); goto endcase; } /* * word erase (^W) */ if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { int ctype; /* * erase whitespace */ while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') ttyrub(c, tp); if (c == -1) goto endcase; /* * erase last char of word and remember the * next chars type (for ALTWERASE) */ ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; if (c == ' ' || c == '\t') { (void)putc(c, &tp->t_rawq); goto endcase; } ctype = ISALPHA(c); /* * erase rest of word */ do { ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; } while (c != ' ' && c != '\t' && (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); (void)putc(c, &tp->t_rawq); goto endcase; } /* * reprint line (^R) */ if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { ttyretype(tp); goto endcase; } /* * ^T - kernel info and generate SIGINFO */ if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { if (ISSET(lflag, ISIG) && tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGINFO, 1); PGRP_UNLOCK(tp->t_pgrp); } if (!ISSET(lflag, NOKERNINFO)) ttyinfo(tp); goto endcase; } } /* * Check for input buffer overflow */ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { input_overflow: if (ISSET(iflag, IMAXBEL)) { if (tp->t_outq.c_cc < tp->t_ohiwat) (void)ttyoutput(CTRL('g'), tp); } goto endcase; } if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Put data char in q for user and * wakeup on seeing a line delimiter. */ if (putc(c, &tp->t_rawq) >= 0) { if (!ISSET(lflag, ICANON)) { ttwakeup(tp); ttyecho(c, tp); goto endcase; } if (TTBREAKC(c, lflag)) { tp->t_rocount = 0; catq(&tp->t_rawq, &tp->t_canq); ttwakeup(tp); } else if (tp->t_rocount++ == 0) tp->t_rocol = tp->t_column; if (ISSET(tp->t_state, TS_ERASE)) { /* * end of prterase \.../ */ CLR(tp->t_state, TS_ERASE); (void)ttyoutput('/', tp); } i = tp->t_column; ttyecho(c, tp); if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { /* * Place the cursor over the '^' of the ^D. */ i = imin(2, tp->t_column - i); while (i > 0) { (void)ttyoutput('\b', tp); i--; } } } endcase: /* * IXANY means allow any character to restart output. */ if (ISSET(tp->t_state, TS_TTSTOP) && !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) return (0); restartoutput: CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); startoutput: return (ttstart(tp)); } /* * Output a single character on a tty, doing output processing * as needed (expanding tabs, newline processing, etc.). * Returns < 0 if succeeds, otherwise returns char to resend. * Must be recursive. */ static int ttyoutput(int c, struct tty *tp) { tcflag_t oflag; int col, s; oflag = tp->t_oflag; if (!ISSET(oflag, OPOST)) { if (ISSET(tp->t_lflag, FLUSHO)) return (-1); if (putc(c, &tp->t_outq)) return (c); tk_nout++; tp->t_outcc++; return (-1); } /* * Do tab expansion if OXTABS is set. Special case if we external * processing, we don't do the tab expansion because we'll probably * get it wrong. 
If tab expansion needs to be done, let it happen * externally. */ CLR(c, ~TTY_CHARMASK); if (c == '\t' && ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { c = 8 - (tp->t_column & 7); if (!ISSET(tp->t_lflag, FLUSHO)) { s = spltty(); /* Don't interrupt tabs. */ c -= b_to_q(" ", c, &tp->t_outq); tk_nout += c; tp->t_outcc += c; splx(s); } tp->t_column += c; return (c ? -1 : '\t'); } if (c == CEOT && ISSET(oflag, ONOEOT)) return (-1); /* * Newline translation: if ONLCR is set, * translate newline into "\r\n". */ if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq)) return (c); } /* If OCRNL is set, translate "\r" into "\n". */ else if (c == '\r' && ISSET(tp->t_oflag, OCRNL)) c = '\n'; /* If ONOCR is set, don't transmit CRs when on column 0. */ else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0) return (-1); tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) return (c); col = tp->t_column; switch (CCLASS(c)) { case BACKSPACE: if (col > 0) --col; break; case CONTROL: break; case NEWLINE: if (ISSET(tp->t_oflag, ONLCR | ONLRET)) col = 0; break; case RETURN: col = 0; break; case ORDINARY: ++col; break; case TAB: col = (col + 8) & ~7; break; } tp->t_column = col; return (-1); } /* * Ioctls for all tty devices. Called after line-discipline specific ioctl * has been called to do discipline-specific functions and/or reject any * of these ioctl commands. */ /* ARGSUSED */ int ttioctl(struct tty *tp, u_long cmd, void *data, int flag) { struct proc *p; struct thread *td; struct pgrp *pgrp; int s, error; td = curthread; /* XXX */ p = td->td_proc; /* If the ioctl involves modification, hang if in the background. */ switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: #ifdef TIOCHPCL case TIOCHPCL: #endif case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSDRAINWAIT: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSETD: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif sx_slock(&proctree_lock); PROC_LOCK(p); while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) && !SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTOU) && !SIGISMEMBER(td->td_sigmask, SIGTTOU)) { pgrp = p->p_pgrp; PROC_UNLOCK(p); if (pgrp->pg_jobc == 0) { sx_sunlock(&proctree_lock); return (EIO); } PGRP_LOCK(pgrp); sx_sunlock(&proctree_lock); pgsignal(pgrp, SIGTTOU, 1); PGRP_UNLOCK(pgrp); error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", 0); if (error) return (error); sx_slock(&proctree_lock); PROC_LOCK(p); } PROC_UNLOCK(p); sx_sunlock(&proctree_lock); break; } switch (cmd) { /* Process the ioctl. */ case FIOASYNC: /* set/clear async i/o */ s = spltty(); if (*(int *)data) SET(tp->t_state, TS_ASYNC); else CLR(tp->t_state, TS_ASYNC); splx(s); break; case FIONBIO: /* set/clear non-blocking i/o */ break; /* XXX: delete. 
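The tab expansion at the top of ttyoutput() above advances to the next 8-column tab stop by emitting 8 - (t_column & 7) spaces. A tiny stand-alone check of that arithmetic, just to make the worked cases explicit:

#include <assert.h>

/* Spaces needed to advance from column `col' to the next 8-column tab stop,
   mirroring the `8 - (tp->t_column & 7)' expression in ttyoutput(). */
static int
tabstop_fill(int col)
{
	return (8 - (col & 7));
}

int
main(void)
{
	assert(tabstop_fill(0) == 8);	/* column 0 -> column 8 */
	assert(tabstop_fill(3) == 5);	/* column 3 -> column 8 */
	assert(tabstop_fill(7) == 1);	/* column 7 -> column 8 */
	assert(tabstop_fill(8) == 8);	/* column 8 -> column 16 */
	return (0);
}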
*/ case FIONREAD: /* get # bytes to read */ s = spltty(); *(int *)data = ttnread(tp); splx(s); break; case FIOSETOWN: /* * Policy -- Don't allow FIOSETOWN on someone else's * controlling tty */ if (tp->t_session != NULL && !isctty(p, tp)) return (ENOTTY); error = fsetown(*(int *)data, &tp->t_sigio); if (error) return (error); break; case FIOGETOWN: if (tp->t_session != NULL && !isctty(p, tp)) return (ENOTTY); *(int *)data = fgetown(&tp->t_sigio); break; case TIOCEXCL: /* set exclusive use of tty */ s = spltty(); SET(tp->t_state, TS_XCLUDE); splx(s); break; case TIOCFLUSH: { /* flush buffers */ int flags = *(int *)data; if (flags == 0) flags = FREAD | FWRITE; else flags &= FREAD | FWRITE; ttyflush(tp, flags); break; } case TIOCCONS: /* become virtual console */ if (*(int *)data) { struct nameidata nid; if (constty && constty != tp && ISSET(constty->t_state, TS_CONNECTED)) return (EBUSY); /* Ensure user can open the real console. */ NDINIT(&nid, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, "/dev/console", td); if ((error = namei(&nid)) != 0) return (error); NDFREE(&nid, NDF_ONLY_PNBUF); error = VOP_ACCESS(nid.ni_vp, VREAD, td->td_ucred, td); vput(nid.ni_vp); if (error) return (error); constty = tp; } else if (tp == constty) constty = NULL; break; case TIOCDRAIN: /* wait till output drained */ error = ttywait(tp); if (error) return (error); break; case TIOCGETA: { /* get termios struct */ struct termios *t = (struct termios *)data; bcopy(&tp->t_termios, t, sizeof(struct termios)); break; } case TIOCGETD: /* get line discipline */ *(int *)data = tp->t_line; break; case TIOCGWINSZ: /* get window size */ *(struct winsize *)data = tp->t_winsize; break; case TIOCGPGRP: /* get pgrp of tty */ if (!isctty(p, tp)) return (ENOTTY); *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; break; #ifdef TIOCHPCL case TIOCHPCL: /* hang up on last close */ s = spltty(); SET(tp->t_cflag, HUPCL); splx(s); break; #endif case TIOCNXCL: /* reset exclusive use of tty */ s = spltty(); CLR(tp->t_state, TS_XCLUDE); splx(s); break; case TIOCOUTQ: /* output queue size */ *(int *)data = tp->t_outq.c_cc; break; case TIOCSETA: /* set termios struct */ case TIOCSETAW: /* drain output, set */ case TIOCSETAF: { /* drn out, fls in, set */ struct termios *t = (struct termios *)data; if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; if (t->c_ispeed == 0) t->c_ispeed = tp->t_ospeed; if (t->c_ispeed == 0) return (EINVAL); s = spltty(); if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = ttywait(tp); if (error) { splx(s); return (error); } if (cmd == TIOCSETAF) ttyflush(tp, FREAD); } if (!ISSET(t->c_cflag, CIGNORE)) { /* * Set device hardware. */ if (tp->t_param && (error = (*tp->t_param)(tp, t))) { splx(s); return (error); } if (ISSET(t->c_cflag, CLOCAL) && !ISSET(tp->t_cflag, CLOCAL)) { /* * XXX disconnections would be too hard to * get rid of without this kludge. The only * way to get rid of controlling terminals * is to exit from the session leader. 
*/ CLR(tp->t_state, TS_ZOMBIE); wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); ttwwakeup(tp); } if ((ISSET(tp->t_state, TS_CARR_ON) || ISSET(t->c_cflag, CLOCAL)) && !ISSET(tp->t_state, TS_ZOMBIE)) SET(tp->t_state, TS_CONNECTED); else CLR(tp->t_state, TS_CONNECTED); tp->t_cflag = t->c_cflag; tp->t_ispeed = t->c_ispeed; if (t->c_ospeed != 0) tp->t_ospeed = t->c_ospeed; ttsetwater(tp); } if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && cmd != TIOCSETAF) { if (ISSET(t->c_lflag, ICANON)) SET(tp->t_lflag, PENDIN); else { /* * XXX we really shouldn't allow toggling * ICANON while we're in a non-termios line * discipline. Now we have to worry about * panicing for a null queue. */ if (tp->t_canq.c_cbreserved > 0 && tp->t_rawq.c_cbreserved > 0) { catq(&tp->t_rawq, &tp->t_canq); /* * XXX the queue limits may be * different, so the old queue * swapping method no longer works. */ catq(&tp->t_canq, &tp->t_rawq); } CLR(tp->t_lflag, PENDIN); } ttwakeup(tp); } tp->t_iflag = t->c_iflag; tp->t_oflag = t->c_oflag; /* * Make the EXTPROC bit read only. */ if (ISSET(tp->t_lflag, EXTPROC)) SET(t->c_lflag, EXTPROC); else CLR(t->c_lflag, EXTPROC); tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); if (t->c_cc[VMIN] != tp->t_cc[VMIN] || t->c_cc[VTIME] != tp->t_cc[VTIME]) ttwakeup(tp); bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); splx(s); break; } case TIOCSETD: { /* set line discipline */ int t = *(int *)data; dev_t device = tp->t_dev; if ((u_int)t >= nlinesw) return (ENXIO); if (t != tp->t_line) { s = spltty(); (*linesw[tp->t_line].l_close)(tp, flag); error = (*linesw[t].l_open)(device, tp); if (error) { (void)(*linesw[tp->t_line].l_open)(device, tp); splx(s); return (error); } tp->t_line = t; splx(s); } break; } case TIOCSTART: /* start output, like ^Q */ s = spltty(); if (ISSET(tp->t_state, TS_TTSTOP) || ISSET(tp->t_lflag, FLUSHO)) { CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); } splx(s); break; case TIOCSTI: /* simulate terminal input */ if ((flag & FREAD) == 0 && suser(td)) return (EPERM); if (!isctty(p, tp) && suser(td)) return (EACCES); s = spltty(); (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); splx(s); break; case TIOCSTOP: /* stop output, like ^S */ s = spltty(); if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); } splx(s); break; case TIOCSCTTY: /* become controlling tty */ /* Session ctty vnode pointer set in vnode layer. 
*/ sx_slock(&proctree_lock); if (!SESS_LEADER(p) || ((p->p_session->s_ttyvp || tp->t_session) && (tp->t_session != p->p_session))) { sx_sunlock(&proctree_lock); return (EPERM); } tp->t_session = p->p_session; tp->t_pgrp = p->p_pgrp; SESS_LOCK(p->p_session); p->p_session->s_ttyp = tp; SESS_UNLOCK(p->p_session); PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); sx_sunlock(&proctree_lock); break; case TIOCSPGRP: { /* set pgrp of tty */ sx_slock(&proctree_lock); pgrp = pgfind(*(int *)data); if (!isctty(p, tp)) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); sx_sunlock(&proctree_lock); return (ENOTTY); } if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (EPERM); } PGRP_UNLOCK(pgrp); if (pgrp->pg_session != p->p_session) { sx_sunlock(&proctree_lock); return (EPERM); } sx_sunlock(&proctree_lock); tp->t_pgrp = pgrp; break; } case TIOCSTAT: /* simulate control-T */ s = spltty(); ttyinfo(tp); splx(s); break; case TIOCSWINSZ: /* set window size */ if (bcmp((caddr_t)&tp->t_winsize, data, sizeof (struct winsize))) { tp->t_winsize = *(struct winsize *)data; if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGWINCH, 1); PGRP_UNLOCK(tp->t_pgrp); } } break; case TIOCSDRAINWAIT: error = suser(td); if (error) return (error); tp->t_timeout = *(int *)data * hz; wakeup(TSA_OCOMPLETE(tp)); wakeup(TSA_OLOWAT(tp)); break; case TIOCGDRAINWAIT: *(int *)data = tp->t_timeout / hz; break; default: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) return (ttcompat(tp, cmd, data, flag)); #else return (ENOIOCTL); #endif } return (0); } int ttypoll(dev_t dev, int events, struct thread *td) { int s; int revents = 0; struct tty *tp; tp = dev->si_tty; if (tp == NULL) /* XXX used to return ENXIO, but that means true! */ return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) | POLLHUP); s = spltty(); if (events & (POLLIN | POLLRDNORM)) { if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &tp->t_rsel); } if (events & (POLLOUT | POLLWRNORM)) { if ((tp->t_outq.c_cc <= tp->t_olowat && ISSET(tp->t_state, TS_CONNECTED)) || ISSET(tp->t_state, TS_ZOMBIE)) revents |= events & (POLLOUT | POLLWRNORM); else selrecord(td, &tp->t_wsel); } splx(s); return (revents); } static struct filterops ttyread_filtops = { 1, NULL, filt_ttyrdetach, filt_ttyread }; static struct filterops ttywrite_filtops = { 1, NULL, filt_ttywdetach, filt_ttywrite }; int ttykqfilter(dev_t dev, struct knote *kn) { struct tty *tp = dev->si_tty; struct klist *klist; int s; switch (kn->kn_filter) { case EVFILT_READ: klist = &tp->t_rsel.si_note; kn->kn_fop = &ttyread_filtops; break; case EVFILT_WRITE: klist = &tp->t_wsel.si_note; kn->kn_fop = &ttywrite_filtops; break; default: return (1); } kn->kn_hook = (caddr_t)dev; s = spltty(); SLIST_INSERT_HEAD(klist, kn, kn_selnext); splx(s); return (0); } static void filt_ttyrdetach(struct knote *kn) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); splx(s); } static int filt_ttyread(struct knote *kn, long hint) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; kn->kn_data = ttnread(tp); if (ISSET(tp->t_state, TS_ZOMBIE)) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_data > 0); } static void filt_ttywdetach(struct knote *kn) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); splx(s); } static int filt_ttywrite(struct knote *kn, long hint) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; 
kn->kn_data = tp->t_outq.c_cc; if (ISSET(tp->t_state, TS_ZOMBIE)) return (1); return (kn->kn_data <= tp->t_olowat && ISSET(tp->t_state, TS_CONNECTED)); } /* * Must be called at spltty(). */ static int ttnread(struct tty *tp) { int nread; if (ISSET(tp->t_lflag, PENDIN)) ttypend(tp); nread = tp->t_canq.c_cc; if (!ISSET(tp->t_lflag, ICANON)) { nread += tp->t_rawq.c_cc; if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) nread = 0; } return (nread); } /* * Wait for output to drain. */ int ttywait(struct tty *tp) { int error, s; error = 0; s = spltty(); while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { (*tp->t_oproc)(tp); if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && ISSET(tp->t_state, TS_CONNECTED)) { SET(tp->t_state, TS_SO_OCOMPLETE); error = ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI | PCATCH, "ttywai", tp->t_timeout); if (error) { if (error == EWOULDBLOCK) error = EIO; break; } } else break; } if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) error = EIO; splx(s); return (error); } /* * Flush if successfully wait. */ static int ttywflush(struct tty *tp) { int error; if ((error = ttywait(tp)) == 0) ttyflush(tp, FREAD); return (error); } /* * Flush tty read and/or write queues, notifying anyone waiting. */ void ttyflush(struct tty *tp, int rw) { int s; s = spltty(); #if 0 again: #endif if (rw & FWRITE) { FLUSHQ(&tp->t_outq); CLR(tp->t_state, TS_TTSTOP); } (*tp->t_stop)(tp, rw); if (rw & FREAD) { FLUSHQ(&tp->t_canq); FLUSHQ(&tp->t_rawq); CLR(tp->t_lflag, PENDIN); tp->t_rocount = 0; tp->t_rocol = 0; CLR(tp->t_state, TS_LOCAL); ttwakeup(tp); if (ISSET(tp->t_state, TS_TBLOCK)) { if (rw & FWRITE) FLUSHQ(&tp->t_outq); ttyunblock(tp); /* * Don't let leave any state that might clobber the * next line discipline (although we should do more * to send the START char). Not clearing the state * may have caused the "putc to a clist with no * reserved cblocks" panic/printf. */ CLR(tp->t_state, TS_TBLOCK); #if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ if (ISSET(tp->t_iflag, IXOFF)) { /* * XXX wait a bit in the hope that the stop * character (if any) will go out. Waiting * isn't good since it allows races. This * will be fixed when the stop character is * put in a special queue. Don't bother with * the checks in ttywait() since the timeout * will save us. */ SET(tp->t_state, TS_SO_OCOMPLETE); ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, "ttyfls", hz / 10); /* * Don't try sending the stop character again. */ CLR(tp->t_state, TS_TBLOCK); goto again; } #endif } } if (rw & FWRITE) { FLUSHQ(&tp->t_outq); ttwwakeup(tp); } splx(s); } /* * Copy in the default termios characters. */ void termioschars(struct termios *t) { bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); } /* * Old interface. */ void ttychars(struct tty *tp) { termioschars(&tp->t_termios); } /* * Handle input high water. Send stop character for the IXOFF case. Turn * on our input flow control bit and propagate the changes to the driver. * XXX the stop character should be put in a special high priority queue. */ void ttyblock(struct tty *tp) { SET(tp->t_state, TS_TBLOCK); if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) CLR(tp->t_state, TS_TBLOCK); /* try again later */ ttstart(tp); } /* * Handle input low water. Send start character for the IXOFF case. Turn * off our input flow control bit and propagate the changes to the driver. 
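The ttykqfilter()/filt_ttyread() path above is what a userland kevent(2) consumer ends up exercising. A minimal sketch of waiting for tty input with EVFILT_READ; error handling is trimmed and the device path is only an example:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ev, out;
	int fd, kq, n;

	fd = open("/dev/ttyd0", O_RDONLY | O_NONBLOCK);	/* example device */
	kq = kqueue();
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);		/* register the filter */
	n = kevent(kq, NULL, 0, &out, 1, NULL);		/* block until readable */
	if (n > 0)
		printf("%d byte(s) readable (data from filt_ttyread)\n",
		    (int)out.data);
	close(kq);
	close(fd);
	return (0);
}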
* XXX the start character should be put in a special high priority queue. */ static void ttyunblock(struct tty *tp) { CLR(tp->t_state, TS_TBLOCK); if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && putc(tp->t_cc[VSTART], &tp->t_outq) != 0) SET(tp->t_state, TS_TBLOCK); /* try again later */ ttstart(tp); } #ifdef notyet /* Not used by any current (i386) drivers. */ /* * Restart after an inter-char delay. */ void ttrstrt(void *tp_arg) { struct tty *tp; int s; KASSERT(tp_arg != NULL, ("ttrstrt")); tp = tp_arg; s = spltty(); CLR(tp->t_state, TS_TIMEOUT); ttstart(tp); splx(s); } #endif int ttstart(struct tty *tp) { if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ (*tp->t_oproc)(tp); return (0); } /* * "close" a line discipline */ int ttylclose(struct tty *tp, int flag) { if (flag & FNONBLOCK || ttywflush(tp)) ttyflush(tp, FREAD | FWRITE); return (0); } /* * Handle modem control transition on a tty. * Flag indicates new state of carrier. * Returns 0 if the line should be turned off, otherwise 1. */ int ttymodem(struct tty *tp, int flag) { if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { /* * MDMBUF: do flow control according to carrier flag * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP * works if IXON and IXANY are clear. */ if (flag) { CLR(tp->t_state, TS_CAR_OFLOW); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { SET(tp->t_state, TS_CAR_OFLOW); SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); } } else if (flag == 0) { /* * Lost carrier. */ CLR(tp->t_state, TS_CARR_ON); if (ISSET(tp->t_state, TS_ISOPEN) && !ISSET(tp->t_cflag, CLOCAL)) { SET(tp->t_state, TS_ZOMBIE); CLR(tp->t_state, TS_CONNECTED); if (tp->t_session) { sx_slock(&proctree_lock); if (tp->t_session->s_leader) { struct proc *p; p = tp->t_session->s_leader; PROC_LOCK(p); psignal(p, SIGHUP); PROC_UNLOCK(p); } sx_sunlock(&proctree_lock); } ttyflush(tp, FREAD | FWRITE); return (0); } } else { /* * Carrier now on. */ SET(tp->t_state, TS_CARR_ON); if (!ISSET(tp->t_state, TS_ZOMBIE)) SET(tp->t_state, TS_CONNECTED); wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); ttwwakeup(tp); } return (1); } /* * Reinput pending characters after state switch * call at spltty(). */ static void ttypend(struct tty *tp) { struct clist tq; int c; CLR(tp->t_lflag, PENDIN); SET(tp->t_state, TS_TYPEN); /* * XXX this assumes too much about clist internals. It may even * fail if the cblock slush pool is empty. We can't allocate more * cblocks here because we are called from an interrupt handler * and clist_alloc_cblocks() can wait. */ tq = tp->t_rawq; bzero(&tp->t_rawq, sizeof tp->t_rawq); tp->t_rawq.c_cbmax = tq.c_cbmax; tp->t_rawq.c_cbreserved = tq.c_cbreserved; while ((c = getc(&tq)) >= 0) ttyinput(c, tp); CLR(tp->t_state, TS_TYPEN); } /* * Process a read call on a tty device. */ int ttread(struct tty *tp, struct uio *uio, int flag) { struct clist *qp; int c; tcflag_t lflag; cc_t *cc = tp->t_cc; struct thread *td; struct proc *p; int s, first, error = 0; int has_stime = 0, last_cc = 0; long slp = 0; /* XXX this should be renamed `timo'. */ struct timeval stime; struct pgrp *pg; td = curthread; p = td->td_proc; loop: s = spltty(); lflag = tp->t_lflag; /* * take pending input first */ if (ISSET(lflag, PENDIN)) { ttypend(tp); splx(s); /* reduce latency */ s = spltty(); lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ } /* * Hang process if it's in the background. 
*/ if (isbackground(p, tp)) { splx(s); sx_slock(&proctree_lock); PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTIN) || SIGISMEMBER(td->td_sigmask, SIGTTIN) || (p->p_flag & P_PPWAIT) || p->p_pgrp->pg_jobc == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (EIO); } pg = p->p_pgrp; PROC_UNLOCK(p); PGRP_LOCK(pg); sx_sunlock(&proctree_lock); pgsignal(pg, SIGTTIN, 1); PGRP_UNLOCK(pg); error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); if (error) return (error); goto loop; } if (ISSET(tp->t_state, TS_ZOMBIE)) { splx(s); return (0); /* EOF */ } /* * If canonical, use the canonical queue, * else use the raw queue. * * (should get rid of clists...) */ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; if (flag & IO_NDELAY) { if (qp->c_cc > 0) goto read; if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { splx(s); return (0); } splx(s); return (EWOULDBLOCK); } if (!ISSET(lflag, ICANON)) { int m = cc[VMIN]; long t = cc[VTIME]; struct timeval timecopy; /* * Check each of the four combinations. * (m > 0 && t == 0) is the normal read case. * It should be fairly efficient, so we check that and its * companion case (m == 0 && t == 0) first. * For the other two cases, we compute the target sleep time * into slp. */ if (t == 0) { if (qp->c_cc < m) goto sleep; if (qp->c_cc > 0) goto read; /* m, t and qp->c_cc are all 0. 0 is enough input. */ splx(s); return (0); } t *= 100000; /* time in us */ #define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ ((t1).tv_usec - (t2).tv_usec)) if (m > 0) { if (qp->c_cc <= 0) goto sleep; if (qp->c_cc >= m) goto read; getmicrotime(&timecopy); if (!has_stime) { /* first character, start timer */ has_stime = 1; stime = timecopy; slp = t; } else if (qp->c_cc > last_cc) { /* got a character, restart timer */ stime = timecopy; slp = t; } else { /* nothing, check expiration */ slp = t - diff(timecopy, stime); if (slp <= 0) goto read; } last_cc = qp->c_cc; } else { /* m == 0 */ if (qp->c_cc > 0) goto read; getmicrotime(&timecopy); if (!has_stime) { has_stime = 1; stime = timecopy; slp = t; } else { slp = t - diff(timecopy, stime); if (slp <= 0) { /* Timed out, but 0 is enough input. */ splx(s); return (0); } } } #undef diff /* * Rounding down may make us wake up just short * of the target, so we round up. * The formula is ceiling(slp * hz/1000000). * 32-bit arithmetic is enough for hz < 169. * XXX see tvtohz() for how to avoid overflow if hz * is large (divide by `tick' and/or arrange to * use tvtohz() if hz is large). */ slp = (long) (((u_long)slp * hz) + 999999) / 1000000; goto sleep; } if (qp->c_cc <= 0) { sleep: /* * There is no input, or not enough input and we can block. */ error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, ISSET(tp->t_state, TS_CONNECTED) ? "ttyin" : "ttyhup", (int)slp); splx(s); if (error == EWOULDBLOCK) error = 0; else if (error) return (error); /* * XXX what happens if another process eats some input * while we are asleep (not just here)? It would be * safest to detect changes and reset our state variables * (has_stime and last_cc). */ slp = 0; goto loop; } read: splx(s); /* * Input present, check for input mapping and processing. */ first = 1; if (ISSET(lflag, ICANON | ISIG)) goto slowcase; for (;;) { char ibuf[IBUFSIZ]; int icc; icc = imin(uio->uio_resid, IBUFSIZ); icc = q_to_b(qp, ibuf, icc); if (icc <= 0) { if (first) goto loop; break; } error = uiomove(ibuf, icc, uio); /* * XXX if there was an error then we should ungetc() the * unmoved chars and reduce icc here. 
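The non-canonical read path above converts VTIME (tenths of a second) to microseconds and then rounds the remaining sleep up to clock ticks with ceil(slp * hz / 1000000). A stand-alone version of that conversion, with hz chosen arbitrarily for the example:

#include <assert.h>

/* VTIME is expressed in tenths of a second. */
static long
vtime_to_usec(int vtime)
{
	return ((long)vtime * 100000);
}

/* Round a microsecond interval up to clock ticks: ceil(usec * hz / 1000000),
   the same formula ttread() uses before calling ttysleep(). */
static long
usec_to_ticks(unsigned long usec, int hz)
{
	return ((long)((usec * hz + 999999) / 1000000));
}

int
main(void)
{
	int hz = 100;			/* example clock rate */

	assert(vtime_to_usec(5) == 500000);		/* 0.5 second */
	assert(usec_to_ticks(500000, hz) == 50);	/* exactly 50 ticks */
	assert(usec_to_ticks(10, hz) == 1);		/* rounds up, never 0 */
	return (0);
}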
*/ if (error) break; if (uio->uio_resid == 0) break; first = 0; } goto out; slowcase: for (;;) { c = getc(qp); if (c < 0) { if (first) goto loop; break; } /* * delayed suspend (^Y) */ if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGTSTP, 1); PGRP_UNLOCK(tp->t_pgrp); } if (first) { error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg3", 0); if (error) break; goto loop; } break; } /* * Interpret EOF only in canonical mode. */ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) break; /* * Give user character. */ error = ureadc(c, uio); if (error) /* XXX should ungetc(c, qp). */ break; if (uio->uio_resid == 0) break; /* * In canonical mode check for a "break character" * marking the end of a "line of input". */ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) break; first = 0; } out: /* * Look to unblock input now that (presumably) * the input queue has gone down. */ s = spltty(); if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) ttyunblock(tp); splx(s); return (error); } /* * Check the output queue on tp for space for a kernel message (from uprintf * or tprintf). Allow some space over the normal hiwater mark so we don't * lose messages due to normal flow control, but don't let the tty run amok. * Sleeps here are not interruptible, but we return prematurely if new signals * arrive. */ int ttycheckoutq(struct tty *tp, int wait) { int hiwat, s; sigset_t oldmask; struct thread *td; struct proc *p; td = curthread; p = td->td_proc; hiwat = tp->t_ohiwat; SIGEMPTYSET(oldmask); s = spltty(); if (wait) { PROC_LOCK(p); oldmask = td->td_siglist; PROC_UNLOCK(p); } if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) while (tp->t_outq.c_cc > hiwat) { ttstart(tp); if (tp->t_outq.c_cc <= hiwat) break; if (!wait) { splx(s); return (0); } PROC_LOCK(p); if (!SIGSETEQ(td->td_siglist, oldmask)) { PROC_UNLOCK(p); splx(s); return (0); } PROC_UNLOCK(p); SET(tp->t_state, TS_SO_OLOWAT); tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); } splx(s); return (1); } /* * Process a write call on a tty device. */ int ttwrite(struct tty *tp, struct uio *uio, int flag) { char *cp = NULL; int cc, ce; struct thread *td; struct proc *p; int i, hiwat, cnt, error, s; char obuf[OBUFSIZ]; hiwat = tp->t_ohiwat; cnt = uio->uio_resid; error = 0; cc = 0; td = curthread; p = td->td_proc; loop: s = spltty(); if (ISSET(tp->t_state, TS_ZOMBIE)) { splx(s); if (uio->uio_resid == cnt) error = EIO; goto out; } if (!ISSET(tp->t_state, TS_CONNECTED)) { if (flag & IO_NDELAY) { splx(s); error = EWOULDBLOCK; goto out; } error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, "ttydcd", 0); splx(s); if (error) goto out; goto loop; } splx(s); /* * Hang the process if it's in the background. */ sx_slock(&proctree_lock); PROC_LOCK(p); if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && !(p->p_flag & P_PPWAIT) && !SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTOU) && !SIGISMEMBER(td->td_sigmask, SIGTTOU)) { if (p->p_pgrp->pg_jobc == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); error = EIO; goto out; } PROC_UNLOCK(p); PGRP_LOCK(p->p_pgrp); sx_sunlock(&proctree_lock); pgsignal(p->p_pgrp, SIGTTOU, 1); PGRP_UNLOCK(p->p_pgrp); error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); if (error) goto out; goto loop; } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } /* * Process the user's data in at most OBUFSIZ chunks. Perform any * output translation. 
Keep track of high water mark, sleep on * overflow awaiting device aid in acquiring new space. */ while (uio->uio_resid > 0 || cc > 0) { if (ISSET(tp->t_lflag, FLUSHO)) { uio->uio_resid = 0; return (0); } if (tp->t_outq.c_cc > hiwat) goto ovhiwat; /* * Grab a hunk of data from the user, unless we have some * leftover from last time. */ if (cc == 0) { cc = imin(uio->uio_resid, OBUFSIZ); cp = obuf; error = uiomove(cp, cc, uio); if (error) { cc = 0; break; } } /* * If nothing fancy need be done, grab those characters we * can handle without any of ttyoutput's processing and * just transfer them to the output q. For those chars * which require special processing (as indicated by the * bits in char_type), call ttyoutput. After processing * a hunk of data, look for FLUSHO so ^O's will take effect * immediately. */ while (cc > 0) { if (!ISSET(tp->t_oflag, OPOST)) ce = cc; else { ce = cc - scanc((u_int)cc, (u_char *)cp, char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. */ if (ce == 0) { tp->t_rocount = 0; if (ttyoutput(*cp, tp) >= 0) { /* No Clists, wait a bit. */ ttstart(tp); if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } error = ttysleep(tp, &lbolt, TTOPRI|PCATCH, "ttybf1", 0); if (error) goto out; goto loop; } cp++; cc--; if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) goto ovhiwat; continue; } } /* * A bunch of normal characters have been found. * Transfer them en masse to the output queue and * continue processing at the top of the loop. * If there are any further characters in this * <= OBUFSIZ chunk, the first should be a character * requiring special handling by ttyoutput. */ tp->t_rocount = 0; i = b_to_q(cp, ce, &tp->t_outq); ce -= i; tp->t_column += ce; cp += ce, cc -= ce, tk_nout += ce; tp->t_outcc += ce; if (i > 0) { /* No Clists, wait a bit. */ ttstart(tp); if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybf2", 0); if (error) goto out; goto loop; } if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) break; } ttstart(tp); } out: /* * If cc is nonzero, we leave the uio structure inconsistent, as the * offset and iov pointers have moved forward, but it doesn't matter * (the call will either return short or restart with a new uio). */ uio->uio_resid += cc; return (error); ovhiwat: ttstart(tp); s = spltty(); /* * This can only occur if FLUSHO is set in t_lflag, * or if ttstart/oproc is synchronous (or very fast). */ if (tp->t_outq.c_cc <= hiwat) { splx(s); goto loop; } if (flag & IO_NDELAY) { splx(s); uio->uio_resid += cc; return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); } SET(tp->t_state, TS_SO_OLOWAT); error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", tp->t_timeout); splx(s); if (error == EWOULDBLOCK) error = EIO; if (error) goto out; goto loop; } /* * Rubout one character from the rawq of tp * as cleanly as possible. 
*/ static void ttyrub(int c, struct tty *tp) { char *cp; int savecol; int tabc, s; if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) return; CLR(tp->t_lflag, FLUSHO); if (ISSET(tp->t_lflag, ECHOE)) { if (tp->t_rocount == 0) { /* * Screwed by ttwrite; retype */ ttyretype(tp); return; } if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) ttyrubo(tp, 2); else { CLR(c, ~TTY_CHARMASK); switch (CCLASS(c)) { case ORDINARY: ttyrubo(tp, 1); break; case BACKSPACE: case CONTROL: case NEWLINE: case RETURN: case VTAB: if (ISSET(tp->t_lflag, ECHOCTL)) ttyrubo(tp, 2); break; case TAB: if (tp->t_rocount < tp->t_rawq.c_cc) { ttyretype(tp); return; } s = spltty(); savecol = tp->t_column; SET(tp->t_state, TS_CNTTB); SET(tp->t_lflag, FLUSHO); tp->t_column = tp->t_rocol; cp = tp->t_rawq.c_cf; if (cp) tabc = *cp; /* XXX FIX NEXTC */ for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) ttyecho(tabc, tp); CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_CNTTB); splx(s); /* savecol will now be length of the tab. */ savecol -= tp->t_column; tp->t_column += savecol; if (savecol > 8) savecol = 8; /* overflow screw */ while (--savecol >= 0) (void)ttyoutput('\b', tp); break; default: /* XXX */ #define PANICSTR "ttyrub: would panic c = %d, val = %d\n" (void)printf(PANICSTR, c, CCLASS(c)); #ifdef notdef panic(PANICSTR, c, CCLASS(c)); #endif } } } else if (ISSET(tp->t_lflag, ECHOPRT)) { if (!ISSET(tp->t_state, TS_ERASE)) { SET(tp->t_state, TS_ERASE); (void)ttyoutput('\\', tp); } ttyecho(c, tp); } else { ttyecho(tp->t_cc[VERASE], tp); /* * This code may be executed not only when an ERASE key * is pressed, but also when ^U (KILL) or ^W (WERASE) are. * So, I didn't think it was worthwhile to pass the extra * information (which would need an extra parameter, * changing every call) needed to distinguish the ERASE2 * case from the ERASE. */ } --tp->t_rocount; } /* * Back over cnt characters, erasing them. */ static void ttyrubo(struct tty *tp, int cnt) { while (cnt-- > 0) { (void)ttyoutput('\b', tp); (void)ttyoutput(' ', tp); (void)ttyoutput('\b', tp); } } /* * ttyretype -- * Reprint the rawq line. Note, it is assumed that c_cc has already * been checked. */ static void ttyretype(struct tty *tp) { char *cp; int s, c; /* Echo the reprint character. */ if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) ttyecho(tp->t_cc[VREPRINT], tp); (void)ttyoutput('\n', tp); /* * XXX * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE * BIT OF FIRST CHAR. */ s = spltty(); for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) ttyecho(c, tp); for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0); cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) ttyecho(c, tp); CLR(tp->t_state, TS_ERASE); splx(s); tp->t_rocount = tp->t_rawq.c_cc; tp->t_rocol = 0; } /* * Echo a typed character to the terminal. */ static void ttyecho(int c, struct tty *tp) { if (!ISSET(tp->t_state, TS_CNTTB)) CLR(tp->t_lflag, FLUSHO); if ((!ISSET(tp->t_lflag, ECHO) && (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || ISSET(tp->t_lflag, EXTPROC)) return; if (ISSET(tp->t_lflag, ECHOCTL) && ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || ISSET(c, TTY_CHARMASK) == 0177)) { (void)ttyoutput('^', tp); CLR(c, ~TTY_CHARMASK); if (c == 0177) c = '?'; else c += 'A' - 1; } (void)ttyoutput(c, tp); } /* * Wake up any readers on a tty. 
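Under ECHOCTL, ttyecho() above prints control characters as a caret pair: 0177 becomes "^?" and anything else maps through c + 'A' - 1. A small sketch of just that mapping:

#include <assert.h>
#include <stdio.h>

/* Map a control character to its visible form, the way ttyecho() does
   under ECHOCTL: 0177 becomes "^?", 003 becomes "^C", and so on. */
static void
echo_ctl(int c, char out[3])
{
	out[0] = '^';
	out[1] = (c == 0177) ? '?' : c + 'A' - 1;
	out[2] = '\0';
}

int
main(void)
{
	char buf[3];

	echo_ctl(003, buf);		/* ETX, i.e. ^C */
	assert(buf[1] == 'C');
	echo_ctl(0177, buf);		/* DEL */
	assert(buf[1] == '?');
	printf("ok\n");
	return (0);
}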
*/ void ttwakeup(struct tty *tp) { if (SEL_WAITING(&tp->t_rsel)) selwakeup(&tp->t_rsel); if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); wakeup(TSA_HUP_OR_INPUT(tp)); KNOTE(&tp->t_rsel.si_note, 0); } /* * Wake up any writers on a tty. */ void ttwwakeup(struct tty *tp) { if (SEL_WAITING(&tp->t_wsel) && tp->t_outq.c_cc <= tp->t_olowat) selwakeup(&tp->t_wsel); if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { CLR(tp->t_state, TS_SO_OCOMPLETE); wakeup(TSA_OCOMPLETE(tp)); } if (ISSET(tp->t_state, TS_SO_OLOWAT) && tp->t_outq.c_cc <= tp->t_olowat) { CLR(tp->t_state, TS_SO_OLOWAT); wakeup(TSA_OLOWAT(tp)); } KNOTE(&tp->t_wsel.si_note, 0); } /* * Look up a code for a specified speed in a conversion table; * used by drivers to map software speed values to hardware parameters. */ int ttspeedtab(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return (-1); } /* * Set input and output watermarks and buffer sizes. For input, the * high watermark is about one second's worth of input above empty, the * low watermark is slightly below high water, and the buffer size is a * driver-dependent amount above high water. For output, the watermarks * are near the ends of the buffer, with about 1 second's worth of input * between them. All this only applies to the standard line discipline. */ void ttsetwater(struct tty *tp) { int cps, ttmaxhiwat, x; /* Input. */ clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); switch (tp->t_ispeedwat) { case (speed_t)-1: cps = tp->t_ispeed / 10; break; case 0: /* * This case is for old drivers that don't know about * t_ispeedwat. Arrange for them to get the old buffer * sizes and watermarks. */ cps = TTYHOG - 2 * 256; tp->t_ififosize = 2 * 256; break; default: cps = tp->t_ispeedwat / 10; break; } tp->t_ihiwat = cps; tp->t_ilowat = 7 * cps / 8; x = cps + tp->t_ififosize; clist_alloc_cblocks(&tp->t_rawq, x, x); /* Output. */ switch (tp->t_ospeedwat) { case (speed_t)-1: cps = tp->t_ospeed / 10; ttmaxhiwat = 2 * TTMAXHIWAT; break; case 0: cps = tp->t_ospeed / 10; ttmaxhiwat = TTMAXHIWAT; break; default: cps = tp->t_ospeedwat / 10; ttmaxhiwat = 8 * TTMAXHIWAT; break; } #define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); x += cps; x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ x += OBUFSIZ + 100; clist_alloc_cblocks(&tp->t_outq, x, x); #undef CLAMP } /* * Report on state of foreground process group. */ void ttyinfo(struct tty *tp) { struct proc *p, *pick; struct timeval utime, stime; const char *stmp, *sprefix; long ltmp; int tmp; struct thread *td; if (ttycheckoutq(tp,0) == 0) return; /* Print load average. 
*/ tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); if (tp->t_session == NULL) ttyprintf(tp, "not a controlling terminal\n"); else if (tp->t_pgrp == NULL) ttyprintf(tp, "no foreground process group\n"); else { PGRP_LOCK(tp->t_pgrp); if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == 0) { PGRP_UNLOCK(tp->t_pgrp); ttyprintf(tp, "empty foreground process group\n"); } else { mtx_lock_spin(&sched_lock); /* Pick interesting process. */ for (pick = NULL; p != 0; p = LIST_NEXT(p, p_pglist)) if (proc_compare(pick, p)) pick = p; PGRP_UNLOCK(tp->t_pgrp); td = FIRST_THREAD_IN_PROC(pick); sprefix = ""; - if (pick->p_flag & P_THREADED) { + if (pick->p_flag & P_SA) { stmp = "KSE" ; /* XXXKSE */ } else { if (td) { if (TD_ON_RUNQ(td) || (TD_IS_RUNNING(td))) { stmp = "running"; } else if (TD_ON_LOCK(td)) { stmp = td->td_lockname; sprefix = "*"; } else if (td->td_wmesg) { stmp = td->td_wmesg; } else { stmp = "iowait"; } } else { stmp = "threadless"; panic("ttyinfo: no thread!?"); } } calcru(pick, &utime, &stime, NULL); if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE) { ltmp = 0; } else { ltmp = pgtok( vmspace_resident_count(pick->p_vmspace)); } mtx_unlock_spin(&sched_lock); ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, pick->p_pid, sprefix, stmp); /* Print user time. */ ttyprintf(tp, "%ld.%02ldu ", utime.tv_sec, utime.tv_usec / 10000); /* Print system time. */ ttyprintf(tp, "%ld.%02lds ", (long)stime.tv_sec, stime.tv_usec / 10000); /* Print percentage cpu, resident set size. */ ttyprintf(tp, "%d%% %ldk\n", tmp / 100, ltmp); } } tp->t_rocount = 0; /* so pending input will be retyped if BS */ } /* * Returns 1 if p2 is "better" than p1 * * The algorithm for picking the "interesting" process is thus: * * 1) Only foreground processes are eligible - implied. * 2) Runnable processes are favored over anything else. The runner * with the highest cpu utilization is picked (p_estcpu). Ties are * broken by picking the highest pid. * 3) The sleeper with the shortest sleep time is next. With ties, * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. 
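The load average printed at the top of ttyinfo() above comes out of the kernel's fixed-point representation, with FSHIFT bits of fraction. A stand-alone version of the conversion, assuming the common FSHIFT value of 11:

#include <assert.h>
#include <stdio.h>

#define FSHIFT	11		/* assumption: the usual value of FSHIFT */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	long ldavg = 2806;	/* 1.37 in fixed point: 1.37 * 2048 ~= 2806 */
	int tmp;

	/* Same rounding as ttyinfo(): scale to hundredths, round, shift. */
	tmp = (ldavg * 100 + FSCALE / 2) >> FSHIFT;
	assert(tmp == 137);
	printf("load: %d.%02d\n", tmp / 100, tmp % 100);	/* "load: 1.37" */
	return (0);
}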
*/ #define ISRUN(p, val) \ do { \ struct thread *td; \ val = 0; \ FOREACH_THREAD_IN_PROC(p, td) { \ if (TD_ON_RUNQ(td) || \ TD_IS_RUNNING(td)) { \ val = 1; \ break; \ } \ } \ } while (0) #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 #define BOTH 3 static int proc_compare(struct proc *p1, struct proc *p2) { int esta, estb; struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); if (p1 == NULL) return (1); ISRUN(p1, esta); ISRUN(p2, estb); /* * see if at least one of them is runnable */ switch (TESTAB(esta, estb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: /* * tie - favor one with highest recent cpu utilization */ esta = estb = 0; FOREACH_KSEGRP_IN_PROC(p1,kg) { esta += kg->kg_estcpu; } FOREACH_KSEGRP_IN_PROC(p2,kg) { estb += kg->kg_estcpu; } if (estb > esta) return (1); if (esta > estb) return (0); return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } /* * weed out zombies */ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: return (0); case BOTH: return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } #if 0 /* XXXKSE */ /* * pick the one with the smallest sleep time */ if (p2->p_slptime > p1->p_slptime) return (0); if (p1->p_slptime > p2->p_slptime) return (1); /* * favor one sleeping in a non-interruptible sleep */ if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) return (1); if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) return (0); #endif return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } /* * Output char to tty; console putchar style. */ int tputchar(int c, struct tty *tp) { int s; s = spltty(); if (!ISSET(tp->t_state, TS_CONNECTED)) { splx(s); return (-1); } if (c == '\n') (void)ttyoutput('\r', tp); (void)ttyoutput(c, tp); ttstart(tp); splx(s); return (0); } /* * Sleep on chan, returning ERESTART if tty changed while we napped and * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If * the tty is revoked, restarting a pending call will redo validation done * at the start of the call. */ int ttysleep(struct tty *tp, void *chan, int pri, char *wmesg, int timo) { int error; int gen; gen = tp->t_gen; error = tsleep(chan, pri, wmesg, timo); if (error) return (error); return (tp->t_gen == gen ? 0 : ERESTART); } /* * Allocate a tty struct. Clists in the struct will be allocated by * ttyopen(). */ struct tty * ttymalloc(struct tty *tp) { if (tp) return(tp); tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO); ttyregister(tp); return (tp); } #if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ /* * Free a tty struct. Clists in the struct should have been freed by * ttyclose(). 
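ttysleep() above relies on the t_gen generation counter so a sleeper can detect that the tty was flushed or recycled while it was blocked. The pattern in isolation; the names here are illustrative, not kernel APIs:

#include <errno.h>

#ifndef ERESTART
#define ERESTART (-1)		/* kernel-internal value; placeholder here */
#endif

struct resource {
	int	gen;		/* bumped whenever the resource is recycled */
};

/* Pretend to block; in the kernel this is where tsleep() would go. */
static int
block_on(struct resource *r)
{
	(void)r;
	return (0);
}

/* Sleep, then tell the caller to restart if the resource was recycled,
   mirroring the generation check in ttysleep(). */
static int
sleep_checked(struct resource *r)
{
	int gen, error;

	gen = r->gen;
	error = block_on(r);
	if (error)
		return (error);
	return (r->gen == gen ? 0 : ERESTART);
}

int
main(void)
{
	struct resource r = { 0 };

	return (sleep_checked(&r));	/* 0: nothing recycled the resource */
}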
*/ void ttyfree(struct tty *tp) { free(tp, M_TTYS); } #endif /* 0 */ void ttyregister(struct tty *tp) { tp->t_timeout = -1; SLIST_INSERT_HEAD(&tty_list, tp, t_list); } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { struct tty *tp; struct xtty xt; int error; SLIST_FOREACH(tp, &tty_list, t_list) { bzero(&xt, sizeof xt); xt.xt_size = sizeof xt; #define XT_COPY(field) xt.xt_##field = tp->t_##field xt.xt_rawcc = tp->t_rawq.c_cc; xt.xt_cancc = tp->t_canq.c_cc; xt.xt_outcc = tp->t_outq.c_cc; XT_COPY(line); if (tp->t_dev) xt.xt_dev = dev2udev(tp->t_dev); XT_COPY(state); XT_COPY(flags); XT_COPY(timeout); if (tp->t_pgrp) xt.xt_pgid = tp->t_pgrp->pg_id; if (tp->t_session) xt.xt_sid = tp->t_session->s_sid; XT_COPY(termios); XT_COPY(winsize); XT_COPY(column); XT_COPY(rocount); XT_COPY(rocol); XT_COPY(ififosize); XT_COPY(ihiwat); XT_COPY(ilowat); XT_COPY(ispeedwat); XT_COPY(ohiwat); XT_COPY(olowat); XT_COPY(ospeedwat); #undef XT_COPY error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) return (error); } return (0); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_ttys, "S,xtty", "All ttys"); SYSCTL_LONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, &tk_nin, 0, "Total TTY in characters"); SYSCTL_LONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, &tk_nout, 0, "Total TTY out characters"); void nottystop(struct tty *tp, int rw) { return; } int ttyread(dev_t dev, struct uio *uio, int flag) { struct tty *tp; tp = dev->si_tty; if (tp == NULL) return (ENODEV); return ((*linesw[tp->t_line].l_read)(tp, uio, flag)); } int ttywrite(dev_t dev, struct uio *uio, int flag) { struct tty *tp; tp = dev->si_tty; if (tp == NULL) return (ENODEV); return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); } Index: head/sys/sparc64/sparc64/trap.c =================================================================== --- head/sys/sparc64/sparc64/trap.c (revision 116360) +++ head/sys/sparc64/sparc64/trap.c (revision 116361) @@ -1,630 +1,630 @@ /*- * Copyright (c) 2001, Jake Burkholder * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * from: FreeBSD: src/sys/i386/i386/trap.c,v 1.197 2001/07/19 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_ktr.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void trap(struct trapframe *tf); void syscall(struct trapframe *tf); static int trap_pfault(struct thread *td, struct trapframe *tf); extern char copy_fault[]; extern char copy_nofault_begin[]; extern char copy_nofault_end[]; extern char fs_fault[]; extern char fs_nofault_begin[]; extern char fs_nofault_end[]; extern char fs_nofault_intr_begin[]; extern char fs_nofault_intr_end[]; extern char *syscallnames[]; const char *trap_msg[] = { "reserved", "instruction access exception", "instruction access error", "instruction access protection", "illtrap instruction", "illegal instruction", "privileged opcode", "floating point disabled", "floating point exception ieee 754", "floating point exception other", "tag overflow", "division by zero", "data access exception", "data access error", "data access protection", "memory address not aligned", "privileged action", "async data error", "trap instruction 16", "trap instruction 17", "trap instruction 18", "trap instruction 19", "trap instruction 20", "trap instruction 21", "trap instruction 22", "trap instruction 23", "trap instruction 24", "trap instruction 25", "trap instruction 26", "trap instruction 27", "trap instruction 28", "trap instruction 29", "trap instruction 30", "trap instruction 31", "fast instruction access mmu miss", "fast data access mmu miss", "interrupt", "physical address watchpoint", "virtual address watchpoint", "corrected ecc error", "spill", "fill", "fill", "breakpoint", "clean window", "range check", "fix alignment", "integer overflow", "syscall", "restore physical watchpoint", "restore virtual watchpoint", "kernel stack fault", }; const int trap_sig[] = { SIGILL, /* reserved */ SIGILL, /* instruction access exception */ SIGILL, /* instruction access error */ SIGILL, /* instruction access protection */ SIGILL, /* illtrap instruction */ SIGILL, /* illegal instruction */ SIGBUS, /* privileged opcode */ SIGFPE, /* floating point disabled */ SIGFPE, /* floating point exception ieee 754 */ SIGFPE, /* floating point exception other */ SIGEMT, /* tag overflow */ SIGFPE, /* division by zero */ SIGILL, /* data access exception */ SIGILL, /* data access error */ SIGBUS, /* data access protection */ SIGBUS, /* memory address not aligned */ SIGBUS, /* privileged action */ SIGBUS, /* async data error */ SIGILL, /* trap instruction 16 */ SIGILL, /* trap instruction 17 */ SIGILL, /* trap instruction 18 */ SIGILL, /* trap instruction 19 */ SIGILL, /* trap instruction 20 */ SIGILL, /* trap instruction 21 
*/ SIGILL, /* trap instruction 22 */ SIGILL, /* trap instruction 23 */ SIGILL, /* trap instruction 24 */ SIGILL, /* trap instruction 25 */ SIGILL, /* trap instruction 26 */ SIGILL, /* trap instruction 27 */ SIGILL, /* trap instruction 28 */ SIGILL, /* trap instruction 29 */ SIGILL, /* trap instruction 30 */ SIGILL, /* trap instruction 31 */ SIGSEGV, /* fast instruction access mmu miss */ SIGSEGV, /* fast data access mmu miss */ -1, /* interrupt */ -1, /* physical address watchpoint */ -1, /* virtual address watchpoint */ -1, /* corrected ecc error */ SIGILL, /* spill */ SIGILL, /* fill */ SIGILL, /* fill */ SIGTRAP, /* breakpoint */ SIGILL, /* clean window */ SIGILL, /* range check */ SIGILL, /* fix alignment */ SIGILL, /* integer overflow */ SIGSYS, /* syscall */ -1, /* restore physical watchpoint */ -1, /* restore virtual watchpoint */ -1, /* kernel stack fault */ }; CTASSERT(sizeof(struct trapframe) == 256); int debugger_on_signal = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_signal, CTLFLAG_RW, &debugger_on_signal, 0, ""); void trap(struct trapframe *tf) { struct thread *td; struct proc *p; u_int sticks; int error; int sig; td = PCPU_GET(curthread); CTR4(KTR_TRAP, "trap: %p type=%s (%s) pil=%#lx", td, trap_msg[tf->tf_type & ~T_KERNEL], (TRAPF_USERMODE(tf) ? "user" : "kernel"), rdpr(pil)); atomic_add_int(&cnt.v_trap, 1); if ((tf->tf_tstate & TSTATE_PRIV) == 0) { KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; sticks = td->td_sticks; td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (tf->tf_type) { case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: sig = trap_pfault(td, tf); break; case T_FILL: sig = rwindow_load(td, tf, 2); break; case T_FILL_RET: sig = rwindow_load(td, tf, 1); break; case T_SPILL: sig = rwindow_save(td); break; default: if (tf->tf_type < 0 || tf->tf_type >= T_MAX || trap_sig[tf->tf_type] == -1) panic("trap: bad trap type"); sig = trap_sig[tf->tf_type]; break; } if (sig != 0) { /* Translate fault for emulators. 
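The user-mode branch above is a plain table dispatch: tf_type indexes trap_sig[], a -1 entry means the trap never reaches userland, and anything else is handed to trapsignal(). A reduced sketch of that dispatch with a small made-up table:

#include <assert.h>
#include <signal.h>

/* A cut-down stand-in for trap_sig[]: index by trap type, -1 = fatal. */
static const int demo_trap_sig[] = {
	SIGILL,		/* 0: reserved */
	SIGFPE,		/* 1: division by zero */
	SIGSEGV,	/* 2: data access miss */
	-1,		/* 3: interrupt, never a user signal */
};
#define DEMO_T_MAX (sizeof(demo_trap_sig) / sizeof(demo_trap_sig[0]))

static int
sig_for_trap(unsigned type)
{
	if (type >= DEMO_T_MAX || demo_trap_sig[type] == -1)
		return (-1);		/* the kernel would panic here */
	return (demo_trap_sig[type]);
}

int
main(void)
{
	assert(sig_for_trap(1) == SIGFPE);
	assert(sig_for_trap(3) == -1);
	return (0);
}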
*/ if (p->p_sysent->sv_transtrap != NULL) { sig = p->p_sysent->sv_transtrap(sig, tf->tf_type); } if (debugger_on_signal && (sig == 4 || sig == 10 || sig == 11)) Debugger("trapsig"); trapsignal(td, sig, tf->tf_type); } userret(td, tf, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif } else { KASSERT((tf->tf_type & T_KERNEL) != 0, ("trap: kernel trap isn't")); switch (tf->tf_type & ~T_KERNEL) { #ifdef DDB case T_BREAKPOINT: case T_KSTACK_FAULT: error = (kdb_trap(tf) == 0); break; #ifdef notyet case T_PA_WATCHPOINT: case T_VA_WATCHPOINT: error = db_watch_trap(tf); break; #endif #endif case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: error = trap_pfault(td, tf); break; case T_DATA_EXCEPTION: case T_MEM_ADDRESS_NOT_ALIGNED: if ((tf->tf_sfsr & MMU_SFSR_FV) != 0 && MMU_SFSR_GET_ASI(tf->tf_sfsr) == ASI_AIUP) { if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } } error = 1; break; default: error = 1; break; } if (error != 0) panic("trap: %s", trap_msg[tf->tf_type & ~T_KERNEL]); } CTR1(KTR_TRAP, "trap: td=%p return", td); } static int trap_pfault(struct thread *td, struct trapframe *tf) { struct vmspace *vm; struct pcb *pcb; struct proc *p; vm_offset_t va; vm_prot_t prot; u_long ctx; int flags; int type; int rv; if (td == NULL) return (-1); KASSERT(td->td_pcb != NULL, ("trap_pfault: pcb NULL")); KASSERT(td->td_proc != NULL, ("trap_pfault: curproc NULL")); KASSERT(td->td_proc->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); p = td->td_proc; rv = KERN_SUCCESS; ctx = TLB_TAR_CTX(tf->tf_tar); pcb = td->td_pcb; type = tf->tf_type & ~T_KERNEL; va = TLB_TAR_VA(tf->tf_tar); CTR4(KTR_TRAP, "trap_pfault: td=%p pm_ctx=%#lx va=%#lx ctx=%#lx", td, p->p_vmspace->vm_pmap.pm_context[PCPU_GET(cpuid)], va, ctx); if (type == T_DATA_PROTECTION) { prot = VM_PROT_WRITE; flags = VM_FAULT_DIRTY; } else { if (type == T_DATA_MISS) prot = VM_PROT_READ; else prot = VM_PROT_READ | VM_PROT_EXECUTE; flags = VM_FAULT_NORMAL; } if (ctx != TLB_CTX_KERNEL) { if ((tf->tf_tstate & TSTATE_PRIV) != 0 && (tf->tf_tpc >= (u_long)fs_nofault_intr_begin && tf->tf_tpc <= (u_long)fs_nofault_intr_end)) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } /* * This is a fault on non-kernel virtual memory. */ vm = p->p_vmspace; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page. */ rv = vm_fault(&vm->vm_map, va, prot, flags); /* * Now the process can be swapped again. */ PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * This is a fault on kernel virtual memory. Attempts to * access kernel memory from user mode cause privileged * action traps, not page fault. */ KASSERT(tf->tf_tstate & TSTATE_PRIV, ("trap_pfault: fault on nucleus context from user mode")); /* * Don't have to worry about process locking or stacks in the * kernel. 
*/ rv = vm_fault(kernel_map, va, prot, VM_FAULT_NORMAL); } CTR3(KTR_TRAP, "trap_pfault: return td=%p va=%#lx rv=%d", td, va, rv); if (rv == KERN_SUCCESS) return (0); if (ctx != TLB_CTX_KERNEL && (tf->tf_tstate & TSTATE_PRIV) != 0) { if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } /* Maximum number of arguments that can be passed via the out registers. */ #define REG_MAXARGS 6 /* * Syscall handler. The arguments to the syscall are passed in the o registers * by the caller, and are saved in the trap frame. The syscall number is passed * in %g1 (and also saved in the trap frame). */ void syscall(struct trapframe *tf) { struct sysent *callp; struct thread *td; register_t args[8]; register_t *argp; struct proc *p; u_int sticks; u_long code; u_long tpc; int reg; int regcnt; int narg; int error; td = PCPU_GET(curthread); KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; atomic_add_int(&cnt.v_syscall, 1); narg = 0; error = 0; reg = 0; regcnt = REG_MAXARGS; sticks = td->td_sticks; td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); code = tf->tf_global[1]; /* * For syscalls, we don't want to retry the faulting instruction * (usually), instead we need to advance one instruction. */ tpc = tf->tf_tpc; TF_DONE(tf); if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ #if 0 (*p->p_sysent->sv_prepsyscall)(tf, args, &code, ¶ms); #endif } else if (code == SYS_syscall || code == SYS___syscall) { code = tf->tf_out[reg++]; regcnt--; } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; if (narg <= regcnt) { argp = &tf->tf_out[reg]; error = 0; } else { KASSERT(narg <= sizeof(args) / sizeof(args[0]), ("Too many syscall arguments!")); argp = args; bcopy(&tf->tf_out[reg], args, sizeof(args[0]) * regcnt); error = copyin((void *)(tf->tf_out[6] + SPOFF + offsetof(struct frame, fr_pad[6])), &args[regcnt], (narg - regcnt) * sizeof(args[0])); } CTR5(KTR_SYSC, "syscall: td=%p %s(%#lx, %#lx, %#lx)", td, syscallnames[code], argp[0], argp[1], argp[2]); /* * Try to run the syscall without the MP lock if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, argp); #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(td, argp); CTR5(KTR_SYSC, "syscall: p=%p error=%d %s return %#lx %#lx ", p, error, syscallnames[code], td->td_retval[0], td->td_retval[1]); } /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: tf->tf_out[0] = td->td_retval[0]; tf->tf_out[1] = td->td_retval[1]; tf->tf_tstate &= ~TSTATE_XCC_C; break; case ERESTART: /* * Undo the tpc advancement we have done above, we want to * reexecute the system call. 
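Syscall argument gathering above takes up to REG_MAXARGS arguments from the out registers saved in the trap frame and copyin()s any remainder from the caller's stack. A stand-alone sketch of just that split, with memcpy standing in for the trap frame access and for copyin:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define REG_MAXARGS	6	/* args that fit in the out registers */

typedef uint64_t register_t;

/* Gather `narg' arguments: up to REG_MAXARGS from `regs', the rest from
   `stack', the same shape as the register/copyin split in syscall(). */
static void
gather_args(const register_t *regs, const register_t *stack, int narg,
    register_t *out)
{
	int fromregs = narg < REG_MAXARGS ? narg : REG_MAXARGS;

	memcpy(out, regs, fromregs * sizeof(register_t));
	if (narg > fromregs)
		memcpy(out + fromregs, stack,
		    (narg - fromregs) * sizeof(register_t));
}

int
main(void)
{
	register_t regs[REG_MAXARGS] = { 1, 2, 3, 4, 5, 6 };
	register_t stack[2] = { 7, 8 };
	register_t args[8];

	gather_args(regs, stack, 8, args);	/* an 8-argument call */
	assert(args[5] == 6 && args[6] == 7 && args[7] == 8);
	return (0);
}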
*/ tf->tf_tpc = tpc; tf->tf_tnpc -= 4; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } tf->tf_out[0] = error; tf->tf_tstate |= TSTATE_XCC_C; break; } /* * Release Giant if we had to get it. Don't use mtx_owned(), * we want to catch broken syscalls. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Handle reschedule and other end-of-syscall issues */ userret(td, tf, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 116360) +++ head/sys/sys/proc.h (revision 116361) @@ -1,933 +1,933 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. 
*/ #ifndef _KERNEL #include #endif #include #include #include #include #include /* XXX */ #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Pgrp id. */ int pg_jobc; /* (m) job cntl proc count */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - only accessed by curthread * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * r - p_peers lock * x - created at fork, only changes during single threading in exec * z - zombie threads/kse/ksegroup lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct ithd; struct ke_sched; struct kg_sched; struct nlminfo; struct p_sched; struct td_sched; struct trapframe; /* * Here we define the four structures used for process information. * * The first is the thread. It might be though of as a "Kernel * Schedulable Entity Context". 
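/*
 * A minimal sketch (not part of this change) of the "two identifiers"
 * rule in the locking key above: p_pptr is marked (c + e), so holding
 * either the proc mutex or proctree_lock is assumed to be enough to
 * read it, while a writer such as proc_reparent() is assumed to need
 * both.  The helper names are hypothetical; proctree_lock, initproc
 * and proc_reparent() are declared further down in this header, and
 * the usual sys/param.h, sys/lock.h, sys/sx.h, sys/mutex.h and
 * sys/proc.h includes are assumed.
 */
static pid_t
example_parent_pid(struct proc *p)
{
	pid_t ppid;

	PROC_LOCK(p);			/* (c) alone suffices for a read */
	ppid = (p->p_pptr != NULL) ? p->p_pptr->p_pid : 0;
	PROC_UNLOCK(p);
	return (ppid);
}

static void
example_reparent_to_init(struct proc *child)
{
	sx_xlock(&proctree_lock);	/* (e) ... */
	PROC_LOCK(child);		/* ... plus (c): both held to modify */
	proc_reparent(child, initproc);
	PROC_UNLOCK(child);
	sx_xunlock(&proctree_lock);
}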
* This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. Always associated with a KSE when running, but can be * reassigned to an equivalent KSE when being restarted for * load balancing. Each of these is associated with a kernel stack * and a pcb. * * It is important to remember that a particular thread structure only * exists as long as the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should therefore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance. (basically to save freeing and then re-allocating it) The KSE * keeps a cached thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inheritance schemes. */ struct thread; /* * The second structure is the Kernel Schedulable Entity. (KSE) * It represents the ability to take a slot in the scheduler queue. * As long as this is scheduled, it could continue to run any threads that * are assigned to the KSEGRP (see later) until either it runs out * of runnable threads of high enough priority, or CPU. * It runs on one CPU and is assigned a quantum of time. When a thread is * blocked, The KSE continues to run and will search for another thread * in a runnable state amongst those it has. It May decide to return to user * mode with a new 'empty' thread if there are no runnable threads. * Threads are temporarily associated with a KSE for scheduling reasons. */ struct kse; /* * The KSEGRP is allocated resources across a number of CPUs. * (Including a number of CPUxQUANTA. It parcels these QUANTA up among * its KSEs, each of which should be running in a different CPU. * BASE priority and total available quanta are properties of a KSEGRP. * Multiple KSEGRPs in a single process compete against each other * for total quanta in the same way that a forked child competes against * it's parent process. */ struct ksegrp; /* * A process is the owner of all system resources allocated to a task * except CPU quanta. * All KSEGs under one process see, and have the same access to, these * resources (e.g. files, memory, sockets, permissions kqueues). * A process may compete for CPU cycles on the same basis as a * forked process cluster by spawning several KSEGRPs. */ struct proc; /*************** * In pictures: With a single run queue used by all processors: RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD | / []---THREAD KSEG---THREAD--THREAD--THREAD [] []---THREAD---THREAD (processors run THREADs from the KSEG until they are exhausted or the KSEG exhausts its quantum) With PER-CPU run queues: KSEs on the separate run queues directly They would be given priorities calculated from the KSEG. * *****************/ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * The first KSE available in the correct group will run this thread. * If several are available, use the one on the same CPU as last time. 
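/*
 * A minimal sketch of the proc -> ksegrp -> thread containment
 * described above, using the FOREACH_* iterators declared further down
 * in this header.  Purely illustrative (the helper name is made up);
 * the caller is assumed to hold the locks covering the fields printed.
 */
static void
example_dump_kse_hierarchy(struct proc *p)
{
	struct ksegrp *kg;
	struct thread *td;

	printf("pid %d (%s): %d thread(s), %d ksegrp(s)\n",
	    p->p_pid, p->p_comm, p->p_numthreads, p->p_numksegrps);
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		printf("  ksegrp %p: %d thread(s), %d KSE(s)\n",
		    (void *)kg, kg->kg_numthreads, kg->kg_kses);
		FOREACH_THREAD_IN_GROUP(kg, td)
			printf("    thread %p, state %d\n",
			    (void *)td, (int)td->td_state);
	}
}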
* When waiting to be run, threads are hung off the KSEGRP in priority order. * with N runnable and queued KSEs in the KSEGRP, the first N threads * are linked to them. Other threads are not yet assigned. */ struct thread { struct proc *td_proc; /* (*) Associated process. */ struct ksegrp *td_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc */ TAILQ_ENTRY(thread) td_kglist; /* (*) All threads in this ksegrp */ /* The two queues below should someday be merged */ TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. XXXKSE */ TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. XXXKSE */ TAILQ_ENTRY(thread) td_runq; /* (j/z) Run queue(s). XXXKSE */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ /* Cleared during fork1() or thread_sched_upcall() */ #define td_startzero td_flags int td_flags; /* (j) TDF_* flags. */ int td_inhibitors; /* (j) Why can not run */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ struct kse *td_last_kse; /* (j) Previous value of td_kse */ struct kse *td_kse; /* (j) Current KSE if running. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ void *td_wchan; /* (j) Sleep address. */ const char *td_wmesg; /* (j) Reason for sleep. */ u_char td_lastcpu; /* (j) Last cpu we were on. */ u_char td_oncpu; /* (j) Which cpu we are on. */ short td_locks; /* (k) DEBUG: lockmgr count of locks */ struct mtx *td_blocked; /* (j) Mutex process is blocked on. */ struct ithd *td_ithd; /* (b) For interrupt threads only. */ const char *td_lockname; /* (j) Name of lock blocked on. */ LIST_HEAD(, mtx) td_contested; /* (j) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address */ struct ucred *td_ucred; /* (k) Reference to credentials. */ void (*td_switchin)(void); /* (k) Switchin special func. */ struct thread *td_standin; /* (*) Use this for an upcall */ u_int td_prticks; /* (*) Profclock hits in sys for user */ struct kse_upcall *td_upcall; /* (*) Upcall structure. */ u_int64_t td_sticks; /* (j) Statclock hits in system mode. */ u_int td_uuticks; /* (*) Statclock hits in user, for UTS */ u_int td_usticks; /* (*) Statclock hits in kernel, for UTS */ u_int td_critnest; /* (k) Critical section nest level. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ sigset_t td_siglist; /* (c) Sigs arrived, not delivered. */ TAILQ_ENTRY(thread) td_umtx; /* (c?) Link for when we're blocked. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall() */ #define td_startcopy td_endzero u_char td_base_pri; /* (j) Thread base kernel priority. */ u_char td_priority; /* (j) Thread active priority. */ #define td_endcopy td_pcb /* * fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, contstructor, etc.. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. 
*/ vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. */ int td_altkstack_pages; /* (a) Size of the alternate kstack */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler specific data */ }; /* flags kept in td_flags */ #define TDF_INPANIC 0x000002 /* Caused a panic, let it drive crashdump. */ #define TDF_CAN_UNBIND 0x000004 /* Only temporarily bound. */ #define TDF_SINTR 0x000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads */ #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ #define TDF_CVWAITQ 0x000080 /* Thread is on a cv_waitq (not slpq). */ #define TDF_UPCALLING 0x000100 /* This thread is doing an upcall. */ #define TDF_ONSLEEPQ 0x000200 /* On the sleep queue. */ #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */ #define TDF_USTATCLOCK 0x004000 /* Stat clock hits in userland. */ #define TDF_OWEUPC 0x008000 /* Owe thread an addupc() call at next AST. */ #define TDF_NEEDRESCHED 0x010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x020000 /* Thread may need signal delivery. */ #define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ /* "private" flags kept in td_pflags */ #define TDP_OLDMASK 0x0001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x0002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x0004 /* Thread is currently in KTRACE code. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem.. bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. 
*/ #define TD_CAN_UNBIND(td) \ (((td)->td_flags & TDF_CAN_UNBIND) == TDF_CAN_UNBIND && \ ((td)->td_upcall != NULL)) #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) do {(td)->td_state = TDS_RUNNING; } while (0) #define TD_SET_RUNQ(td) do {(td)->td_state = TDS_RUNQ; } while (0) #define TD_SET_CAN_RUN(td) do {(td)->td_state = TDS_CAN_RUN; } while (0) #define TD_SET_ON_SLEEPQ(td) do {(td)->td_flags |= TDF_ONSLEEPQ; } while (0) #define TD_CLR_ON_SLEEPQ(td) do { \ (td)->td_flags &= ~TDF_ONSLEEPQ; \ (td)->td_wchan = NULL; \ } while (0) /* * The schedulable entity that can be given a context to run. * A process may have several of these. Probably one per processor * but posibly a few more. In this universe they are grouped * with a KSEG that contains the priority and niceness * for the group. */ struct kse { struct proc *ke_proc; /* (*) Associated process. */ struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */ TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ #define ke_startzero ke_flags int ke_flags; /* (j) KEF_* flags. */ struct thread *ke_thread; /* (*) Active associated thread. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ u_char ke_oncpu; /* (j) Which cpu we are on. */ char ke_rqindex; /* (j) Run queue index. */ enum { KES_UNUSED = 0x0, KES_IDLE, KES_ONRUNQ, KES_UNQUEUED, /* in transit */ KES_THREAD /* slaved to thread state */ } ke_state; /* (j) KSE status. */ #define ke_endzero ke_dummy u_char ke_dummy; struct ke_sched *ke_sched; /* (*) Scheduler specific data */ }; /* flags kept in ke_flags */ #define KEF_DIDRUN 0x02000 /* KSE actually ran. */ #define KEF_EXIT 0x04000 /* KSE is being killed. */ /* * The upcall management structure. * The upcall is used when returning to userland. If a thread does not have * an upcall on return to userland the thread exports its context and exits. */ struct kse_upcall { TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in KSEG. 
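/*
 * A minimal sketch of the inhibitor macros above: a blocking path sets
 * td_state to TDS_INHIBITED with TDI_SLEEPING in td_inhibitors, and
 * the wakeup side clears the inhibitor, dropping the thread back to
 * TDS_CAN_RUN once no other inhibitors remain.  The helpers are
 * hypothetical and assume sched_lock (a spin mutex in this tree)
 * covers the (j) fields touched; the real sleep path lives in msleep().
 */
static void
example_block_thread(struct thread *td, void *wchan, const char *wmesg)
{
	mtx_lock_spin(&sched_lock);
	td->td_wchan = wchan;		/* (j) sleep address */
	td->td_wmesg = wmesg;		/* (j) reason for sleep */
	TD_SET_SLEEPING(td);		/* TDS_INHIBITED, TDI_SLEEPING set */
	mtx_unlock_spin(&sched_lock);
}

static void
example_unblock_thread(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	td->td_wchan = NULL;
	TD_CLR_SLEEPING(td);		/* TDS_CAN_RUN if last inhibitor */
	if (TD_CAN_RUN(td))
		setrunqueue(td);	/* declared later in this header */
	mtx_unlock_spin(&sched_lock);
}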
*/ struct ksegrp *ku_ksegrp; /* Associated KSEG. */ struct thread *ku_owner; /* owning thread */ int ku_flags; /* KUF_* flags. */ struct kse_mailbox *ku_mailbox; /* userland mailbox address. */ stack_t ku_stack; /* userland upcall stack. */ void *ku_func; /* userland upcall function. */ unsigned int ku_mflags; /* cached upcall mailbox flags */ }; #define KUF_DOUPCALL 0x00001 /* Do upcall now, don't wait */ #define KUF_EXITING 0x00002 /* Upcall structure is exiting */ /* * Kernel-scheduled entity group (KSEG). The scheduler considers each KSEG to * be an indivisible unit from a time-sharing perspective, though each KSEG may * contain multiple KSEs. */ struct ksegrp { struct proc *kg_proc; /* (*) Process that contains this KSEG. */ TAILQ_ENTRY(ksegrp) kg_ksegrp; /* (*) Queue of KSEGs in kg_proc. */ TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */ TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) All idle KSEs. */ TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */ TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads */ TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */ TAILQ_HEAD(, kse_upcall) kg_upcalls; /* All upcalls in the group */ #define kg_startzero kg_estcpu u_int kg_estcpu; /* (j) Sum of the same field in KSEs. */ u_int kg_slptime; /* (j) How long completely blocked. */ struct thread *kg_last_assigned; /* (j) Last thread assigned to a KSE. */ int kg_runnable; /* (j) Num runnable threads on queue. */ int kg_runq_kses; /* (j) Num KSEs on runq. */ int kg_idle_kses; /* (j) Num KSEs on iq */ int kg_numupcalls; /* (j) Num upcalls */ int kg_upsleeps; /* (c) Num threads in kse_release() */ struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */ int kg_nextupcall; /* (*) Next upcall time */ int kg_upquantum; /* (*) Quantum to schedule an upcall */ #define kg_endzero kg_pri_class #define kg_startcopy kg_endzero u_char kg_pri_class; /* (j) Scheduling class. */ u_char kg_user_pri; /* (j) User pri from estcpu and nice. */ char kg_nice; /* (c + j) Process "nice" value. */ #define kg_endcopy kg_numthreads int kg_numthreads; /* (j) Num threads in total */ int kg_kses; /* (j) Num KSEs in group. */ struct kg_sched *kg_sched; /* (*) Scheduler specific data */ }; /* * The old fashionned process. May have multiple threads, KSEGRPs * and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, ksegrp) p_ksegrps; /* (kg_ksegrp) All KSEGs. */ TAILQ_HEAD(, thread) p_threads; /* (td_plist) Threads. (shortcut) */ TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */ /* Accumulated stats for all KSEs? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c*) Process limits. */ struct vm_object *p_upages_obj; /* (a) Upages object. */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ /*struct ksegrp p_ksegrp; struct kse p_kse; */ /* * The following don't make too much sense.. * See the td_ or ke_ versions of the same flags */ int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* KSEs can be run */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. 
*/ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtime; /* (j) Time swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct bintime p_runtime; /* (j) Real time. */ u_int64_t p_uu; /* (j) Previous user time in usec. */ u_int64_t p_su; /* (j) Previous system time in usec. */ u_int64_t p_iu; /* (j) Previous intr time in usec. */ u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ u_int64_t p_sticks; /* (j) Statclock hits in system mode. */ u_int64_t p_iticks; /* (j) Statclock hits in intr. */ int p_profthreads; /* (c) Num threads in addupc_task */ int p_maxthrwaits; /* (c) Max threads num waiters */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ sigset_t p_siglist; /* (c) Sigs not delivered to a td. */ char p_lock; /* (c) Proclock (prevent swap) count. */ struct klist p_klist; /* (c) Knotes attached to this proc. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ void *p_aioinfo; /* (?) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (c) # threads in suspended mode */ /* End area that is zeroed on creation. */ #define p_endzero p_sigstk /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero stack_t p_sigstk; /* (c) Stack ptr and on-stack flag. */ u_int p_magic; /* (b) Magic number. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (j) Current CPU limit in seconds. */ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ int p_numthreads; /* (j) Number of threads. */ int p_numksegrps; /* (?) number of ksegrps */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ struct user *p_uarea; /* (k) Kernel VA of u-area (CPU) */ u_short p_acflag; /* (c) Accounting flags. */ struct rusage *p_ru; /* (a) Exit information. XXX */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. 
*/ struct label p_label; /* (*) Process (not subject) MAC label */ struct p_sched *p_sched; /* (*) Scheduler specific data */ }; #define p_rlimit p_limit->pl_rlimit #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU 0xff /* For when we aren't on a CPU. (SMP) */ /* Status values (p_stat). */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread. (*)*/ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread in requesting to stop prof */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ -#define P_THREADED 0x08000 /* Process is using threads. */ +#define P_SA 0x08000 /* Using scheduler activations. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing */ #define P_STOPPED_SINGLE 0x80000 /* Only one thread can continue */ /* (not to user) */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ /* Should be moved to machine-dependent areas. */ #define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress. */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_ALTSTACK 0x2000000 /* Have alternate signal stack. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) /* These flags are kept in p_sflag and are protected with sched_lock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_XCPU 0x00002 /* Exceeded CPU limit. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ #define PS_SWAPPINGOUT 0x00200 /* Process is being swapped out. */ #define PS_SWAPPINGIN 0x04000 /* Process is being swapped in. */ #define PS_MACPEND 0x08000 /* Ast()-based MAC event pending. */ /* used only in legacy conversion code */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. 
*/ #define P_MAGIC 0xbeefface #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_KSEGRP_IN_PROC(p, kg) \ TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp) #define FOREACH_THREAD_IN_GROUP(kg, td) \ TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist) #define FOREACH_KSE_IN_GROUP(kg, ke) \ TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist) #define FOREACH_UPCALL_IN_GROUP(kg, ku) \ TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) /* XXXKSE the lines below should probably only be used in 1:1 code */ #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&p->p_threads) #define FIRST_KSEGRP_IN_PROC(p) TAILQ_FIRST(&p->p_ksegrps) #define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&kg->kg_kseq) #define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p)) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } #define STOPEVENT(p, e, v) do { \ PROC_LOCK(p); \ _STOPEVENT((p), (e), (v)); \ PROC_UNLOCK(p); \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ if ((p)->p_stops & (e)) { \ stopevent((p), (e), (v)); \ } \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) \ do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0); #define PGRP_UNLOCK_PGSIGNAL(pg) \ do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0); /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_lock++; \ if (((p)->p_sflag & PS_INMEM) == 0) \ faultin((p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (--(p)->p_lock); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) /* Lock and unlock process arguments. 
*/ #define PARGS_LOCK(p) mtx_lock(&pargs_ref_lock) #define PARGS_UNLOCK(p) mtx_unlock(&pargs_ref_lock) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx pargs_ref_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0 */ extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0 */ extern struct kse kse0; /* Primary kse in proc0 */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; extern int ps_argsopen; extern int ps_showallprocs; extern int sched_quantum; /* Scheduling quantum in ticks. */ LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct proc *updateproc; /* Process slot for syncer (sic). */ extern struct uma_zone *proc_zone; extern int lastpid; struct proc *pfind(pid_t); /* Find process by id. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ void adjustrunqueue(struct thread *, int newpri); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); int leavepgrp(struct proc *p); void mi_switch(void); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_free(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); void threadinit(void); void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void setrunnable(struct thread *); void setrunqueue(struct thread *); void setsugid(struct proc *p); int sigonstack(size_t sp); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void cpu_idle(void); #if !defined(__alpha__) && !defined(__powerpc__) void cpu_switch(struct thread *old, struct thread *new); void cpu_throw(struct thread *old, struct thread *new) __dead2; #else void cpu_switch(void); void cpu_throw(void) __dead2; #endif void unsleep(struct thread *); void userret(struct thread *, struct 
trapframe *, u_int); void cpu_exit(struct thread *); void cpu_sched_exit(struct thread *); void exit1(struct thread *, int) __dead2; void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. */ struct ksegrp *ksegrp_alloc(void); void ksegrp_free(struct ksegrp *kg); void ksegrp_stash(struct ksegrp *kg); struct kse *kse_alloc(void); void kse_free(struct kse *ke); void kse_stash(struct kse *ke); void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_setup(struct thread *td); void kse_reassign(struct kse *ke); void kse_link(struct kse *ke, struct ksegrp *kg); void kse_unlink(struct kse *ke); void ksegrp_link(struct ksegrp *kg, struct proc *p); void ksegrp_unlink(struct ksegrp *kg); void thread_signal_add(struct thread *td, int sig); void thread_signal_upcall(struct thread *td); struct thread *thread_alloc(void); void thread_exit(void) __dead2; int thread_export_context(struct thread *td); void thread_free(struct thread *td); void thread_link(struct thread *td, struct ksegrp *kg); void thread_reap(void); struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku); int thread_single(int how); #define SINGLE_NO_EXIT 0 /* values for 'how' */ #define SINGLE_EXIT 1 void thread_single_end(void); void thread_stash(struct thread *td); int thread_suspend_check(int how); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_unsuspend_one(struct thread *td); int thread_userret(struct thread *td, struct trapframe *frame); void thread_user_enter(struct proc *p, struct thread *td); void thread_wait(struct proc *p); int thread_statclock(int user); struct kse_upcall *upcall_alloc(void); void upcall_free(struct kse_upcall *ku); void upcall_link(struct kse_upcall *ku, struct ksegrp *kg); void upcall_unlink(struct kse_upcall *ku); void upcall_remove(struct thread *td); void upcall_stash(struct kse_upcall *ke); void thread_sanity_check(struct thread *td, char *); void thread_stopped(struct proc *p); void thread_switchout(struct thread *td); void thr_exit1(void); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */
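/*
 * A minimal sketch of walking the global process list declared above:
 * allproc is protected by the allproc_lock sx lock, and the (c) fields
 * of each process want that process's own mutex while it is examined.
 * Hypothetical debugging helper only.
 */
static void
example_list_processes(void)
{
	struct proc *p;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		printf("pid %5d  %s\n", p->p_pid, p->p_comm);
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}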