Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 319872) +++ head/sys/amd64/amd64/trap.c (revision 319873) @@ -1,938 +1,940 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef KDTRACE_HOOKS #include #endif extern void __noinline trap(struct trapframe *frame); extern void trap_check(struct trapframe *frame); extern void syscall(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; ksiginfo_t ksi; VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif frame->tf_rflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Emulator can take care about this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) goto userout; addr = frame->tf_addr; i = trap_pfault(frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); goto userout; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); goto out; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); goto out; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); goto out; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; goto out; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; goto out; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap()) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: nmi_handle_intr(type, frame); goto out; #endif /* DEV_ISA */ } trap_fatal(frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n"); printf("rip = 0x%lx\n", frame->tf_rip); printf("rsp = 0x%lx\n", frame->tf_rsp); printf("rbp = 0x%lx\n", frame->tf_rbp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; register_t *argp; + struct syscall_args *sa; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; reg = 0; regcnt = 6; params = (caddr_t)frame->tf_rsp + sizeof(register_t); sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame->tf_rdi; argp += reg; bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { - struct syscall_args sa; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif - error = syscallenter(td, &sa); + error = syscallenter(td); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", - syscallname(td->td_proc, sa.code))); + syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", - syscallname(td->td_proc, sa.code))); + syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_md.md_invl_gen.gen == 0, ("System call %s returning with leaked invl_gen %lu", - syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen)); + syscallname(td->td_proc, td->td_sa.code), + td->td_md.md_invl_gen.gen)); - syscallret(td, error, &sa); + syscallret(td, error); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } Index: head/sys/amd64/cloudabi32/cloudabi32_sysvec.c =================================================================== --- head/sys/amd64/cloudabi32/cloudabi32_sysvec.c (revision 319872) +++ head/sys/amd64/cloudabi32/cloudabi32_sysvec.c (revision 319873) @@ -1,231 +1,235 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; extern unsigned long ia32_maxssiz; static int cloudabi32_fixup_tcb(register_t **stack_base, struct image_params *imgp) { int error; uint32_t args[2]; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi32_fixup(stack_base, imgp); if (error != 0) return (error); /* * On i386, the TCB is referred to by %gs:0. Reuse the empty * space normally used by the return address (args[0]) to store * a single element array, containing a pointer to the TCB. %gs * base will point to this. * * Also let the first argument of the entry point (args[1]) * refer to the auxiliary vector, which is stored right after * the TCB. */ args[0] = (uintptr_t)*stack_base; args[1] = (uintptr_t)*stack_base + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); *stack_base -= howmany(sizeof(args), sizeof(register_t)); return (copyout(args, *stack_base, sizeof(args))); } static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { ia32_setregs(td, imgp, stack); (void)cpu_set_user_tls(td, (void *)stack); } static int -cloudabi32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cloudabi32_fetch_syscall_args(struct thread *td) { - struct trapframe *frame = td->td_frame; + struct trapframe *frame; + struct syscall_args *sa; int error; + + frame = td->td_frame; + sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_rax; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* * Fetch system call arguments. * * The vDSO has already made sure that the arguments are * eight-byte aligned. Pointers and size_t parameters are * zero-extended. This makes it possible to copy in the * arguments directly. As long as the call doesn't use 32-bit * data structures, we can just invoke the same system call * implementation used by 64-bit processes. */ error = copyin((void *)frame->tf_rcx, sa->args, sa->narg * sizeof(sa->args[0])); if (error != 0) return (error); /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* * System call succeeded. * * Simply copy out the 64-bit return values into the * same buffer provided for system call arguments. The * vDSO will copy them to the right spot, truncating * pointers and size_t values to 32 bits. */ frame->tf_rax = copyout(td->td_retval, (void *)frame->tf_rcx, sizeof(td->td_retval)) == 0 ? 0 : CLOUDABI_EFAULT; break; case ERESTART: /* Restart system call. */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_rax = cloudabi_convert_errno(error); break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; register_t retval[2]; /* Return values for processes returning from fork. */ if ((td->td_pflags & TDP_FORKING) != 0) { retval[0] = CLOUDABI_PROCESS_CHILD; retval[1] = td->td_tid; copyout(retval, (void *)frame->tf_rcx, sizeof(retval)); } } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { stack_t stack; uint32_t args[3]; void *frameptr; int error; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len - sizeof(args); cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Copy the arguments for the thread entry point onto the stack * (args[1] and args[2]). Similar to process startup, use the * otherwise unused return address (args[0]) for TLS. */ args[0] = tcb; args[1] = td->td_tid; args[2] = attr->argument; frameptr = (void *)td->td_frame->tf_rsp; error = copyout(args, frameptr, sizeof(args)); if (error != 0) return (error); return (cpu_set_user_tls(td, frameptr)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup_tcb, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_pagesize = IA32_PAGE_SIZE, .sv_minuser = FREEBSD32_MINUSER, .sv_maxuser = FREEBSD32_MAXUSER, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_fixlimit = ia32_fixlimit, .sv_maxssiz = &ia32_maxssiz, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_386, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; Index: head/sys/amd64/cloudabi64/cloudabi64_sysvec.c =================================================================== --- head/sys/amd64/cloudabi64/cloudabi64_sysvec.c (revision 319872) +++ head/sys/amd64/cloudabi64/cloudabi64_sysvec.c (revision 319873) @@ -1,216 +1,220 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi64_syscallnames[]; extern struct sysent cloudabi64_sysent[]; static int cloudabi64_fixup_tcb(register_t **stack_base, struct image_params *imgp) { int error; register_t tcbptr; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi64_fixup(stack_base, imgp); if (error != 0) return (error); /* * On x86-64, the TCB is referred to by %fs:0. Take some space * from the top of the stack to store a single element array, * containing a pointer to the TCB. %fs base will point to this. */ tcbptr = (register_t)*stack_base; return (copyout(&tcbptr, --*stack_base, sizeof(tcbptr))); } static void cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB, the TCB itself, * and the auxiliary vector. Let %rdx point to the auxiliary * vector, and set %fs base to the address of the TCB. */ regs = td->td_frame; regs->tf_rdi = stack + sizeof(register_t) + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, (void *)stack); } static int -cloudabi64_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cloudabi64_fetch_syscall_args(struct thread *td) { - struct trapframe *frame = td->td_frame; + struct trapframe *frame; + struct syscall_args *sa; + + frame = td->td_frame; + sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_rax; if (sa->code >= CLOUDABI64_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi64_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments. */ sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; /* Actually %r10. */ sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void cloudabi64_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_rax = td->td_retval[0]; frame->tf_rdx = td->td_retval[1]; frame->tf_rflags &= ~PSL_C; break; case ERESTART: /* Restart system call. */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_rax = cloudabi_convert_errno(error); frame->tf_rflags |= PSL_C; break; } } static void cloudabi64_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* Initial register values for processes returning from fork. */ frame->tf_rax = CLOUDABI_PROCESS_CHILD; frame->tf_rdx = td->td_tid; } int cloudabi64_thread_setregs(struct thread *td, const cloudabi64_threadattr_t *attr, uint64_t tcb) { struct trapframe *frame; stack_t stack; uint64_t tcbptr; int error; /* * On x86-64, the TCB is referred to by %fs:0. Take some space * from the top of the stack to store a single element array, * containing a pointer to the TCB. %fs base will point to this. */ tcbptr = rounddown(attr->stack + attr->stack_len - sizeof(tcbptr), _Alignof(tcbptr)); error = copyout(&tcb, (void *)tcbptr, sizeof(tcb)); if (error != 0) return (error); /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = tcbptr - attr->stack; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_rdi = td->td_tid; frame->tf_rsi = attr->argument; return (cpu_set_user_tls(td, (void *)tcbptr)); } static struct sysentvec cloudabi64_elf_sysvec = { .sv_size = CLOUDABI64_SYS_MAXSYSCALL, .sv_table = cloudabi64_sysent, .sv_fixup = cloudabi64_fixup_tcb, .sv_name = "CloudABI ELF64", .sv_coredump = elf64_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi64_copyout_strings, .sv_setregs = cloudabi64_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64, .sv_set_syscall_retval = cloudabi64_set_syscall_retval, .sv_fetch_syscall_args = cloudabi64_fetch_syscall_args, .sv_syscallnames = cloudabi64_syscallnames, .sv_schedtail = cloudabi64_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec); Elf64_Brandinfo cloudabi64_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_X86_64, .sysvec = &cloudabi64_elf_sysvec, .flags = BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC, }; Index: head/sys/amd64/ia32/ia32_syscall.c =================================================================== --- head/sys/amd64/ia32/ia32_syscall.c (revision 319872) +++ head/sys/amd64/ia32/ia32_syscall.c (revision 319873) @@ -1,250 +1,251 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd); void ia32_syscall(struct trapframe *frame); /* Called from asm code */ void ia32_set_syscall_retval(struct thread *td, int error) { cpu_set_syscall_retval(td, error); } int -ia32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +ia32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; + struct syscall_args *sa; caddr_t params; u_int32_t args[8], tmp; int error, i; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); sa->code = frame->tf_rax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword32(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. */ error = fueword32(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)args, (u_int)(sa->narg * sizeof(int))); else error = 0; for (i = 0; i < sa->narg; i++) sa->args[i] = args[i]; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" void ia32_syscall(struct trapframe *frame) { struct thread *td; - struct syscall_args sa; register_t orig_tf_rflags; int error; ksiginfo_t ksi; orig_tf_rflags = frame->tf_rflags; td = curthread; td->td_frame = frame; - error = syscallenter(td, &sa); + error = syscallenter(td); /* * Traced syscall. */ if (orig_tf_rflags & PSL_T) { frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_rip; trapsignal(td, &ksi); } - syscallret(td, error, &sa); + syscallret(td, error); } static void ia32_syscall_enable(void *dummy) { setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); } static void ia32_syscall_disable(void *dummy) { setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL); SYSUNINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_disable, NULL); #ifdef COMPAT_43 int setup_lcall_gate(void) { struct i386_ldt_args uap; struct user_segment_descriptor desc; uint32_t lcall_addr; int error; bzero(&uap, sizeof(uap)); uap.start = 0; uap.num = 1; lcall_addr = curproc->p_sysent->sv_psstrings - sz_lcall_tramp; bzero(&desc, sizeof(desc)); desc.sd_type = SDT_MEMERA; desc.sd_dpl = SEL_UPL; desc.sd_p = 1; desc.sd_def32 = 1; desc.sd_gran = 1; desc.sd_lolimit = 0xffff; desc.sd_hilimit = 0xf; desc.sd_lobase = lcall_addr; desc.sd_hibase = lcall_addr >> 24; error = amd64_set_ldt(curthread, &uap, &desc); if (error != 0) return (error); return (0); } #endif Index: head/sys/amd64/linux/linux_sysvec.c =================================================================== --- head/sys/amd64/linux/linux_sysvec.c (revision 319872) +++ head/sys/amd64/linux/linux_sysvec.c (revision 319873) @@ -1,990 +1,992 @@ /*- * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux64, 1); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif #if defined(DEBUG) SYSCTL_PROC(_compat_linux, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, linux_sysctl_debug, "A", "Linux 64 debugging control"); #endif /* * Allow the this functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 const char *linux_kplatform; static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux_locore_o_start; extern char _binary_linux_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static register_t * linux_copyout_strings(struct image_params *imgp); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); static void linux_set_syscall_retval(struct thread *td, int error); -static int linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa); +static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, u_long stack); static int linux_vsyscall(struct thread *td); /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc; frame = td->td_frame; + sa = &td->td_sa; sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; /* * On Linux only %rcx and %r11 values are not preserved across * the syscall. * So, do not clobber %rdx and %r10 */ td->td_retval[1] = frame->tf_rdx; frame->tf_r10 = frame->tf_rcx; cpu_set_syscall_retval(td, error); /* Restore all registers. */ set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args; Elf_Addr *base; Elf_Addr *pos; struct ps_strings *arginfo; struct proc *p; int issetugid; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; KASSERT(curthread->td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); base = (Elf64_Addr *)*stack_base; args = (Elf64_Auxargs *)imgp->auxargs; pos = base + (imgp->args->argc + imgp->args->envc + 2); issetugid = p->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (uint64_t)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* * Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (caddr_t)arginfo - SPARE_USRSPACE - roundup(sizeof(canary), sizeof(char *)) - roundup(execpath_len, sizeof(char *)) - roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *)); if (execpath_len != 0) { imgp->execpathp = (uintptr_t)arginfo - execpath_len; copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); imgp->canary = (uintptr_t)arginfo - roundup(execpath_len, sizeof(char *)) - roundup(sizeof(canary), sizeof(char *)); copyout(canary, (void *)imgp->canary, sizeof(canary)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; set_pcb_flags(pcb, PCB_FULL_IRET); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } /* * Copied from amd64/amd64/machdep.c * * XXX fpu state need? don't think so */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_ucontext uc; struct l_sigcontext *context; struct trapframe *regs; unsigned long rflags; int error; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); if (error != 0) return (error); p = td->td_proc; context = &uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; int sig, code; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe); } else sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); mtx_unlock(&psp->ps_mtx); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Save user context. */ bzero(&sf, sizeof(sf)); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ sf.sf_handler = catcher; /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &sf.sf_si, sig); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds * attempt to use the alternate path for the interpreter. * If an alternate path is found, use our stringspace * to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } #define LINUX_VSYSCALL_START (-10UL << 20) #define LINUX_VSYSCALL_SZ 1024 const unsigned long linux_vsyscall_vector[] = { LINUX_SYS_gettimeofday, LINUX_SYS_linux_time, /* getcpu not implemented */ }; static int linux_vsyscall(struct thread *td) { struct trapframe *frame; uint64_t retqaddr; int code, traced; int error; frame = td->td_frame; /* Check %rip for vsyscall area */ if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) return (EINVAL); if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) return (EINVAL); code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; if (code >= nitems(linux_vsyscall_vector)) return (EINVAL); /* * vsyscall called as callq *(%rax), so we must * use return address from %rsp and also fixup %rsp */ error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); if (error) return (error); frame->tf_rip = retqaddr; frame->tf_rax = linux_vsyscall_vector[code]; frame->tf_rsp += 8; traced = (frame->tf_flags & PSL_T); amd64_syscall(td, traced); return (0); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux_locore_o_end - &_binary_linux_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec, SHAREDPAGE); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; linux_kplatform = linux_shared_page_mapping + (linux_platform - (caddr_t)SHAREDPAGE); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj); }; SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)linux_vdso_deinstall, NULL); static char GNULINUX_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. */ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNULINUX_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib64/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk64", NULL, MTX_DEF); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux x86-64 ELF exec handler installed\n"); } else printf("cannot insert Linux x86-64 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "Linux 64bit support"); Index: head/sys/amd64/linux32/linux32_sysvec.c =================================================================== --- head/sys/amd64/linux32/linux32_sysvec.c (revision 319872) +++ head/sys/amd64/linux32/linux32_sysvec.c (revision 319873) @@ -1,1208 +1,1210 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define AUXARGS_ENTRY_32(pos, id, val) \ do { \ suword32(pos++, id); \ suword32(pos++, val); \ } while (0) #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX32_SYS_linux_rt_sendsig 0 #define LINUX32_SYS_linux_sendsig 0 const char *linux_kplatform; static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux32_locore_o_start; extern char _binary_linux32_locore_o_end; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static register_t *linux_copyout_strings(struct image_params *imgp); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static void linux32_fixlimit(struct rlimit *rl, int which); static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); base = (Elf32_Addr *)*stack_base; args = (Elf32_Auxargs *)imgp->auxargs; pos = base + (imgp->args->argc + imgp->args->envc + 2); issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall); AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY_32(pos, AT_BASE, args->base); AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary)); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp)); if (args->execfd != -1) AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY_32(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword32(base, (uint32_t)imgp->args->argc); *stack_base = (register_t *)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn * and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; lmask.__mask = frame.sf_extramask[0]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int -linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; + struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an * alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = imgp->ps_strings; fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); td->td_retval[1] = 0; } /* * XXX copied from ia32_sysvec.c. */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; u_int32_t *vectp; char *stringp, *destp; u_int32_t *stack_base; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; /* * Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (caddr_t)arginfo - SPARE_USRSPACE - roundup(sizeof(canary), sizeof(char *)) - roundup(execpath_len, sizeof(char *)) - roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *)); if (execpath_len != 0) { imgp->execpathp = (uintptr_t)arginfo - execpath_len; copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); imgp->canary = (uintptr_t)arginfo - roundup(execpath_len, sizeof(char *)) - roundup(sizeof(canary), sizeof(char *)); copyout(canary, (void *)imgp->canary, sizeof(canary)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(u_int32_t)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (u_int32_t *)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); /* * vectp also becomes our initial stack base */ stack_base = vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword32(vectp++, 0); suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword32(vectp, 0); return ((register_t *)stack_base); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); #if defined(DEBUG) SYSCTL_PROC(_compat_linux32, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, linux_sysctl_debug, "A", "Linux debugging control"); #endif static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_mask = 0, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux32_locore_o_end - &_binary_linux32_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; linux_kplatform = linux_shared_page_mapping + (linux_platform - (caddr_t)LINUX32_SHAREDPAGE); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj); }; SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. */ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); Index: head/sys/arm/arm/syscall.c =================================================================== --- head/sys/arm/arm/syscall.c (revision 319872) +++ head/sys/arm/arm/syscall.c (revision 319873) @@ -1,177 +1,178 @@ /* $NetBSD: fault.c,v 1.45 2003/11/20 14:44:36 scw Exp $ */ /*- * Copyright 2004 Olivier Houchard * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1997 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * fault.c * * Fault handlers * * Created : 28/11/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include void swi_handler(struct trapframe *); int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap; + struct syscall_args *sa; int error; + sa = &td->td_sa; sa->code = td->td_frame->tf_r7; ap = &td->td_frame->tf_r0; if (sa->code == SYS_syscall) { sa->code = *ap++; sa->nap--; } else if (sa->code == SYS___syscall) { sa->code = ap[_QUAD_LOWWORD]; sa->nap -= 2; ap += 2; } p = td->td_proc; if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; error = 0; memcpy(sa->args, ap, sa->nap * sizeof(register_t)); if (sa->narg > sa->nap) { error = copyin((void *)td->td_frame->tf_usr_sp, sa->args + sa->nap, (sa->narg - sa->nap) * sizeof(register_t)); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; } return (error); } #include "../../kern/subr_syscall.c" static void syscall(struct thread *td, struct trapframe *frame) { - struct syscall_args sa; int error; - sa.nap = 4; + td->td_sa.nap = 4; - error = syscallenter(td, &sa); + error = syscallenter(td); KASSERT(error != 0 || td->td_ar == NULL, ("returning from syscall with td_ar set!")); - syscallret(td, error, &sa); + syscallret(td, error); } void swi_handler(struct trapframe *frame) { struct thread *td = curthread; td->td_frame = frame; td->td_pticks = 0; /* * Enable interrupts if they were enabled before the exception. * Since all syscalls *should* come from user mode it will always * be safe to enable them, but check anyway. */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(frame->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } syscall(td, frame); } Index: head/sys/arm/cloudabi32/cloudabi32_sysvec.c =================================================================== --- head/sys/arm/cloudabi32/cloudabi32_sysvec.c (revision 319872) +++ head/sys/arm/cloudabi32/cloudabi32_sysvec.c (revision 319873) @@ -1,193 +1,197 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let r0 point to the auxiliary vector, and set * tpidrurw to the TCB. */ regs = td->td_frame; regs->tf_r0 = td->td_retval[0] = stack + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, (void *)stack); } static int -cloudabi32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cloudabi32_fetch_syscall_args(struct thread *td) { - struct trapframe *frame = td->td_frame; + struct trapframe *frame; + struct syscall_args *sa; int error; + + frame = td->td_frame; + sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_r12; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments from registers and the stack. */ sa->args[0] = frame->tf_r0; sa->args[1] = frame->tf_r1; sa->args[2] = frame->tf_r2; sa->args[3] = frame->tf_r3; if (sa->narg > 4) { error = copyin((void *)td->td_frame->tf_usr_sp, &sa->args[4], (sa->narg - 4) * sizeof(register_t)); if (error != 0) return (error); } /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_r1; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_r0 = td->td_retval[0]; frame->tf_r1 = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_pc -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_r0 = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_r0 = CLOUDABI_PROCESS_CHILD; frame->tf_r1 = td->td_tid; } } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_r0 = td->td_tid; frame->tf_r1 = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, (void *)tcb)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_ARM, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; Index: head/sys/arm64/arm64/trap.c =================================================================== --- head/sys/arm64/arm64/trap.c (revision 319872) +++ head/sys/arm64/arm64/trap.c (revision 319873) @@ -1,431 +1,432 @@ /*- * Copyright (c) 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif #ifdef VFP #include #endif #ifdef KDB #include #endif #ifdef DDB #include #endif extern register_t fsu_intr_fault; /* Called from exception.S */ void do_el1h_sync(struct thread *, struct trapframe *); void do_el0_sync(struct thread *, struct trapframe *); void do_el0_error(struct trapframe *); static void print_registers(struct trapframe *frame); int (*dtrace_invop_jump_addr)(struct trapframe *); static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; trapsignal(td, &ksi); } int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap; + struct syscall_args *sa; int nap; nap = 8; p = td->td_proc; ap = td->td_frame->tf_x; + sa = &td->td_sa; sa->code = td->td_frame->tf_x[8]; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = *ap++; nap--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; memcpy(sa->args, ap, nap * sizeof(register_t)); if (sa->narg > nap) panic("ARM64TODO: Could we have more than 8 args?"); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" static void svc_handler(struct thread *td, struct trapframe *frame) { - struct syscall_args sa; int error; if ((frame->tf_esr & ESR_ELx_ISS_MASK) == 0) { - error = syscallenter(td, &sa); - syscallret(td, error, &sa); + error = syscallenter(td); + syscallret(td, error); } else { call_trapsignal(td, SIGILL, ILL_ILLOPN, (void *)frame->tf_elr); userret(td, frame); } } static void data_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { struct vm_map *map; struct proc *p; struct pcb *pcb; vm_prot_t ftype; vm_offset_t va; int error, sig, ucode; /* * According to the ARMv8-A rev. A.g, B2.10.5 "Load-Exclusive * and Store-Exclusive instruction usage restrictions", state * of the exclusive monitors after data abort exception is unknown. */ clrex(); #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif pcb = td->td_pcb; /* * Special case for fuswintr and suswintr. These can't sleep so * handle them early on in the trap handler. */ if (__predict_false(pcb->pcb_onfault == (vm_offset_t)&fsu_intr_fault)) { frame->tf_elr = pcb->pcb_onfault; return; } p = td->td_proc; if (lower) map = &p->p_vmspace->vm_map; else { /* The top bit tells us which range to use */ if (far >= VM_MAXUSER_ADDRESS) { map = kernel_map; } else { map = &p->p_vmspace->vm_map; if (map == NULL) map = kernel_map; } } if (pmap_fault(map->pmap, esr, far) == KERN_SUCCESS) return; KASSERT(td->td_md.md_spinlock_count == 0, ("data abort with spinlock held")); if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("data abort in critical section or under mutex"); } va = trunc_page(far); ftype = ((esr >> 6) & 1) ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ; /* Fault in the page. */ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (error != KERN_SUCCESS) { if (lower) { sig = SIGSEGV; if (error == KERN_PROTECTION_FAILURE) ucode = SEGV_ACCERR; else ucode = SEGV_MAPERR; call_trapsignal(td, sig, ucode, (void *)far); } else { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != 0) { frame->tf_x[0] = error; frame->tf_elr = pcb->pcb_onfault; return; } printf("Fatal data abort:\n"); print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(ESR_ELx_EXCEPTION(esr), 0, frame)) return; #endif panic("vm_fault failed: %lx", frame->tf_elr); } } if (lower) userret(td, frame); } static void print_registers(struct trapframe *frame) { u_int reg; for (reg = 0; reg < nitems(frame->tf_x); reg++) { printf(" %sx%d: %16lx\n", (reg < 10) ? " " : "", reg, frame->tf_x[reg]); } printf(" sp: %16lx\n", frame->tf_sp); printf(" lr: %16lx\n", frame->tf_lr); printf(" elr: %16lx\n", frame->tf_elr); printf("spsr: %8x\n", frame->tf_spsr); } void do_el1h_sync(struct thread *td, struct trapframe *frame) { uint32_t exception; uint64_t esr, far; /* Read the esr register to get the exception details */ esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR4(KTR_TRAP, "do_el1_sync: curthread: %p, esr %lx, elr: %lx, frame: %p", td, esr, frame->tf_elr, frame); switch(exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: #ifdef VFP if ((td->td_pcb->pcb_fpflags & PCB_FP_KERN) != 0) { vfp_restore_state(); } else #endif { print_registers(frame); printf(" esr: %.8lx\n", esr); panic("VFP exception in the kernel"); } break; case EXCP_INSN_ABORT: case EXCP_DATA_ABORT: far = READ_SPECIALREG(far_el1); intr_enable(); data_abort(td, frame, esr, far, 0); break; case EXCP_BRK: #ifdef KDTRACE_HOOKS if ((esr & ESR_ELx_ISS_MASK) == 0x40d && \ dtrace_invop_jump_addr != 0) { dtrace_invop_jump_addr(frame); break; } #endif /* FALLTHROUGH */ case EXCP_WATCHPT_EL1: case EXCP_SOFTSTP_EL1: #ifdef KDB kdb_trap(exception, 0, frame); #else panic("No debugger in kernel.\n"); #endif break; default: print_registers(frame); panic("Unknown kernel exception %x esr_el1 %lx\n", exception, esr); } } /* * The attempted execution of an instruction bit pattern that has no allocated * instruction results in an exception with an unknown reason. */ static void el0_excp_unknown(struct trapframe *frame, uint64_t far) { struct thread *td; td = curthread; call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far); userret(td, frame); } void do_el0_sync(struct thread *td, struct trapframe *frame) { uint32_t exception; uint64_t esr, far; /* Check we have a sane environment when entering from userland */ KASSERT((uintptr_t)get_pcpu() >= VM_MIN_KERNEL_ADDRESS, ("Invalid pcpu address from userland: %p (tpidr %lx)", get_pcpu(), READ_SPECIALREG(tpidr_el1))); esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); switch (exception) { case EXCP_UNKNOWN: case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: far = READ_SPECIALREG(far_el1); } intr_enable(); CTR4(KTR_TRAP, "do_el0_sync: curthread: %p, esr %lx, elr: %lx, frame: %p", td, esr, frame->tf_elr, frame); switch(exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: #ifdef VFP vfp_restore_state(); #else panic("VFP exception in userland"); #endif break; case EXCP_SVC: svc_handler(td, frame); break; case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: data_abort(td, frame, esr, far, 1); break; case EXCP_UNKNOWN: el0_excp_unknown(frame, far); break; case EXCP_SP_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_sp); userret(td, frame); break; case EXCP_PC_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_elr); userret(td, frame); break; case EXCP_BRK: call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_elr); userret(td, frame); break; case EXCP_MSR: call_trapsignal(td, SIGILL, ILL_PRVOPC, (void *)frame->tf_elr); userret(td, frame); break; case EXCP_SOFTSTP_EL0: td->td_frame->tf_spsr &= ~PSR_SS; td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP; WRITE_SPECIALREG(MDSCR_EL1, READ_SPECIALREG(MDSCR_EL1) & ~DBG_MDSCR_SS); call_trapsignal(td, SIGTRAP, TRAP_TRACE, (void *)frame->tf_elr); userret(td, frame); break; default: call_trapsignal(td, SIGBUS, BUS_OBJERR, (void *)frame->tf_elr); userret(td, frame); break; } KASSERT((td->td_pcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0, ("Kernel VFP flags set while entering userspace")); KASSERT( td->td_pcb->pcb_fpusaved == &td->td_pcb->pcb_fpustate, ("Kernel VFP state in use when entering userspace")); } void do_el0_error(struct trapframe *frame) { panic("ARM64TODO: do_el0_error"); } Index: head/sys/arm64/cloudabi64/cloudabi64_sysvec.c =================================================================== --- head/sys/arm64/cloudabi64/cloudabi64_sysvec.c (revision 319872) +++ head/sys/arm64/cloudabi64/cloudabi64_sysvec.c (revision 319873) @@ -1,185 +1,189 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi64_syscallnames[]; extern struct sysent cloudabi64_sysent[]; static void cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let x0 point to the auxiliary vector, and set * tpidr_el0 to the TCB. */ regs = td->td_frame; regs->tf_x[0] = td->td_retval[0] = stack + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, (void *)stack); } static int -cloudabi64_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cloudabi64_fetch_syscall_args(struct thread *td) { - struct trapframe *frame = td->td_frame; + struct trapframe *frame; + struct syscall_args *sa; int i; + + frame = td->td_frame; + sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_x[8]; if (sa->code >= CLOUDABI64_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi64_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments. */ for (i = 0; i < MAXARGS; i++) sa->args[i] = frame->tf_x[i]; /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_x[1]; return (0); } static void cloudabi64_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_x[0] = td->td_retval[0]; frame->tf_x[1] = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_elr -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_x[0] = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi64_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_x[0] = CLOUDABI_PROCESS_CHILD; frame->tf_x[1] = td->td_tid; } } int cloudabi64_thread_setregs(struct thread *td, const cloudabi64_threadattr_t *attr, uint64_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_x[0] = td->td_tid; frame->tf_x[1] = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, (void *)tcb)); } static struct sysentvec cloudabi64_elf_sysvec = { .sv_size = CLOUDABI64_SYS_MAXSYSCALL, .sv_table = cloudabi64_sysent, .sv_fixup = cloudabi64_fixup, .sv_name = "CloudABI ELF64", .sv_coredump = elf64_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi64_copyout_strings, .sv_setregs = cloudabi64_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64, .sv_set_syscall_retval = cloudabi64_set_syscall_retval, .sv_fetch_syscall_args = cloudabi64_fetch_syscall_args, .sv_syscallnames = cloudabi64_syscallnames, .sv_schedtail = cloudabi64_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec); Elf64_Brandinfo cloudabi64_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_AARCH64, .sysvec = &cloudabi64_elf_sysvec, .flags = BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC, }; Index: head/sys/compat/ia32/ia32_util.h =================================================================== --- head/sys/compat/ia32/ia32_util.h (revision 319872) +++ head/sys/compat/ia32/ia32_util.h (revision 319873) @@ -1,57 +1,57 @@ /*- * Copyright (c) 1998-1999 Andrew Gallatin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _COMPAT_IA32_IA32_UTIL_H #define _COMPAT_IA32_IA32_UTIL_H #include #include #include #include #include #include #define FREEBSD32_MAXUSER ((1ul << 32) - IA32_PAGE_SIZE) #define FREEBSD32_MINUSER 0 #define FREEBSD32_SHAREDPAGE (FREEBSD32_MAXUSER - IA32_PAGE_SIZE) #define FREEBSD32_USRSTACK FREEBSD32_SHAREDPAGE #define IA32_PAGE_SIZE 4096 #define IA32_MAXDSIZ (512*1024*1024) /* 512MB */ #define IA32_MAXSSIZ (64*1024*1024) /* 64MB */ #define IA32_MAXVMEM 0 /* Unlimited */ struct syscall_args; -int ia32_fetch_syscall_args(struct thread *td, struct syscall_args *sa); +int ia32_fetch_syscall_args(struct thread *td); void ia32_set_syscall_retval(struct thread *, int); void ia32_fixlimit(struct rlimit *rl, int which); #endif /* _COMPAT_IA32_IA32_UTIL_H */ Index: head/sys/i386/cloudabi32/cloudabi32_sysvec.c =================================================================== --- head/sys/i386/cloudabi32/cloudabi32_sysvec.c (revision 319872) +++ head/sys/i386/cloudabi32/cloudabi32_sysvec.c (revision 319873) @@ -1,204 +1,208 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static int cloudabi32_fixup_tcb(register_t **stack_base, struct image_params *imgp) { int error; uint32_t args[2]; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi32_fixup(stack_base, imgp); if (error != 0) return (error); /* * On i386, the TCB is referred to by %gs:0. Reuse the empty * space normally used by the return address (args[0]) to store * a single element array, containing a pointer to the TCB. %gs * base will point to this. * * Also let the first argument of the entry point (args[1]) * refer to the auxiliary vector, which is stored right after * the TCB. */ args[0] = (uintptr_t)*stack_base; args[1] = (uintptr_t)*stack_base + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); *stack_base -= howmany(sizeof(args), sizeof(register_t)); return (copyout(args, *stack_base, sizeof(args))); } static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, unsigned long stack) { exec_setregs(td, imgp, stack); (void)cpu_set_user_tls(td, (void *)stack); } static int -cloudabi32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cloudabi32_fetch_syscall_args(struct thread *td) { - struct trapframe *frame = td->td_frame; + struct trapframe *frame; + struct syscall_args *sa; int error; + + frame = td->td_frame; + sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_eax; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; sa->narg = sa->callp->sy_narg; /* Fetch system call arguments from the stack. */ error = copyin((void *)(frame->tf_esp + 4), sa->args, sa->narg * sizeof(sa->args[0])); if (error != 0) return (error); /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_eax = td->td_retval[0]; frame->tf_edx = td->td_retval[1]; frame->tf_eflags &= ~PSL_C; break; case ERESTART: /* Restart system call. */ frame->tf_eip -= frame->tf_err; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_eax = cloudabi_convert_errno(error); frame->tf_eflags |= PSL_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* Initial register values for processes returning from fork. */ frame->tf_eax = CLOUDABI_PROCESS_CHILD; frame->tf_edx = td->td_tid; } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { stack_t stack; uint32_t args[3]; void *frameptr; int error; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len - sizeof(args); cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Copy the arguments for the thread entry point onto the stack * (args[1] and args[2]). Similar to process startup, use the * otherwise unused return address (args[0]) for TLS. */ args[0] = tcb; args[1] = td->td_tid; args[2] = attr->argument; frameptr = (void *)td->td_frame->tf_esp; error = copyout(args, frameptr, sizeof(args)); if (error != 0) return (error); return (cpu_set_user_tls(td, frameptr)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup_tcb, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_386, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 319872) +++ head/sys/i386/i386/trap.c (revision 319873) @@ -1,1124 +1,1125 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern void trap(struct trapframe *frame); extern void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif static int prot_fault_translation = 0; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; vm_offset_t eva; ksiginfo_t ksi; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP && frame->tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while holding * a spin lock. */ if (type != T_PAGEFLT && td->td_md.md_spinlock_count == 0) enable_intr(); } } eva = 0; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and conditionally * reenable interrupts. If we hold a spin lock, then * we must not reenable interrupts. This might be a * spurious page fault. */ eva = rcr2(); if (td->td_md.md_spinlock_count == 0) enable_intr(); } if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif user_trctrap_out: frame->tf_eflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (i == 0) goto user; break; } i = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto userout; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; uprintf("pid %d killed due to lack of floating point\n", p->p_pid); i = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = npxtrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); goto out; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) goto out; break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); goto out; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_eip == (int)doreti_iret) { frame->tf_eip = (int)doreti_iret_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_ds) { frame->tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_es) { frame->tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_fs) { frame->tf_eip = (int)doreti_popl_fs_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ kernel_trctrap: if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame->tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto out; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); goto out; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x esp 0x%08x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_esp, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return (-2); #endif if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a user-space address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; #if defined(PAE) || defined(PAE_TABLES) else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; #endif else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", #if defined(PAE) || defined(PAE_TABLES) pg_nx != 0 ? (code & PGEX_I ? " instruction" : " data") : #endif "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) { frame->tf_err = eva; /* smuggle fault address to ddb */ if (kdb_trap(type, 0, frame)) { frame->tf_err = code; /* restore error code */ return; } frame->tf_err = code; /* restore error code */ } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; + struct syscall_args *sa; caddr_t params; long tmp; int error; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; params = (caddr_t)frame->tf_esp + sizeof(int); sa->code = frame->tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)sa->args, (u_int)(sa->narg * sizeof(int))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; - struct syscall_args sa; register_t orig_tf_eflags; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; - error = syscallenter(td, &sa); + error = syscallenter(td); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", - syscallname(td->td_proc, sa.code))); + syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", - syscallname(td->td_proc, sa.code))); + syscallname(td->td_proc, td->td_sa.code))); - syscallret(td, error, &sa); + syscallret(td, error); } Index: head/sys/i386/linux/linux_sysvec.c =================================================================== --- head/sys/i386/linux/linux_sysvec.c (revision 319872) +++ head/sys/i386/linux/linux_sysvec.c (revision 319873) @@ -1,1200 +1,1202 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif #if defined(DEBUG) SYSCTL_PROC(_compat_linux, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, linux_sysctl_debug, "A", "Linux debugging control"); #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux_locore_o_start; extern char _binary_linux_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(register_t **stack_base, struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static register_t *linux_copyout_strings(struct image_params *imgp); static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); static int linux_szplatform; const char *linux_kplatform; static eventhandler_tag linux_exit_tag; static eventhandler_tag linux_exec_tag; static eventhandler_tag linux_thread_dtor_tag; /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)args->argc + 1); (*stack_base)--; suword(*stack_base, (intptr_t)(void *)envp); (*stack_base)--; suword(*stack_base, (intptr_t)(void *)argv); (*stack_base)--; suword(*stack_base, imgp->args->argc); return (0); } static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { struct proc *p; Elf32_Auxargs *args; Elf32_Addr *uplatform; struct ps_strings *arginfo; register_t *pos; int issetugid; KASSERT(curthread->td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); p = imgp->proc; issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(uplatform)); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; suword(*stack_base, (register_t)imgp->args->argc); return (0); } /* * Copied from kern/kern_exec.c */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* * Calculate string base and vector table pointers. */ p = imgp->proc; if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform - roundup(sizeof(canary), sizeof(char *)) - roundup(execpath_len, sizeof(char *)) - roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *)); /* * install LINUX_PLATFORM */ copyout(linux_kplatform, ((caddr_t)arginfo - linux_szplatform), linux_szplatform); if (execpath_len != 0) { imgp->execpathp = (uintptr_t)arginfo - linux_szplatform - execpath_len; copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); imgp->canary = (uintptr_t)arginfo - linux_szplatform - roundup(execpath_len, sizeof(char *)) - roundup(sizeof(canary), sizeof(char *)); copyout(canary, (void *)imgp->canary, sizeof(canary)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = linux_rt_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig, code; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = ksi->ksi_signo; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_esp = regs->tf_esp; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = linux_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; sigset_t bmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int -linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; + struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; sa->code = frame->tf_eax; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. */ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } static void linux_get_machine(const char **dst) { switch (cpu_class) { case CPUCLASS_686: *dst = "i686"; break; case CPUCLASS_586: *dst = "i586"; break; case CPUCLASS_486: *dst = "i486"; break; default: *dst = "i386"; } } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux_locore_o_end - &_binary_linux_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX_SHAREDPAGE); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj); }; SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. */ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit, NULL, 1000); linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec, NULL, 1000); linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); linux_get_machine(&linux_kplatform); linux_szplatform = roundup(strlen(linux_kplatform) + 1, sizeof(char *)); linux_osd_jail_register(); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); FEATURE(linux, "Linux 32bit support"); Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 319872) +++ head/sys/kern/init_main.c (revision 319873) @@ -1,858 +1,857 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int -null_fetch_syscall_args(struct thread *td __unused, - struct syscall_args *sa __unused) +null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_prison = &prison0; newcred->cr_loginclass = loginclass_find("default"); proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crfree(td->td_ucred); td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 319872) +++ head/sys/kern/kern_fork.c (revision 319873) @@ -1,1116 +1,1116 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(struct thread *td, struct pdfork_args *uap) { struct fork_req fr; int error, fd, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPROCDESC; fr.fr_pidp = &pid; fr.fr_pd_fd = &fd; fr.fr_pd_flags = uap->flags; /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct fork_req fr; int error, pid; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); fr.fr_flags = uap->flags; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > pid_max - 100) /* out of range */ pid = pid_max - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); static int fork_findpid(int flags) { struct proc *p; int trypid; static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ sx_assert(&allproc_lock, SX_LOCKED); sx_assert(&proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= pid_max) { trypid = trypid % pid_max; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. * * Avoid reuse of the process group id, session id or * the reaper subtree id. Note that for process group * and sessions, the amount of reserved pids is * limited by process limit. For the subtree ids, the * id is kept reserved only while there is a * non-reaped process in the subtree, so amount of * reserved pids is limited by process limit times * two. */ p = LIST_FIRST(&allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || p->p_reapsubtree == trypid || (p->p_pgrp != NULL && (p->p_pgrp->pg_id == trypid || (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p->p_pid > trypid && pidchecked > p->p_pid) pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && pidchecked > p->p_pgrp->pg_id) pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && pidchecked > p->p_session->s_sid) pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; return (trypid); } static int fork_norfproc(struct thread *td, int flags) { int error; struct proc *p1; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error) goto fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd, false); fdescfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(td); fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2, struct vmspace *vm2, struct file *fp_procdesc) { struct proc *p1, *pptr; int trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p1 = td->td_proc; trypid = fork_findpid(fr->fr_flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { fd = fdinit(p1->p_fd, false); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC | P2_TRAPCAP); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); /* * Whilst the proc lock is held, copy the VM domain data out * using the VM domain method. */ vm_domain_policy_init(&p2->p_vm_dom_policy); vm_domain_policy_localcopy(&p2->p_vm_dom_policy, &p1->p_vm_dom_policy); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vrefact(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { VM_CNT_INC(v_forks); VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { VM_CNT_INC(v_vforks); VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { VM_CNT_INC(v_kthreads); VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { VM_CNT_INC(v_rforks); VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif /* * Hold the process so that it cannot exit after we make it runnable, * but before we wait for the debugger. */ _PHOLD(p2); if (p1->p_ptevents & PTRACE_FORK) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; } if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } if ((fr->fr_flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; } else { *fr->fr_procp = p2; } PROC_LOCK(p2); /* * Wait until debugger is attached to child. */ while (td2->td_proc == p2 && (td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); _PRELE(p2); racct_proc_fork_done(p2); PROC_UNLOCK(p2); } int fork1(struct thread *td, struct fork_req *fr) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; static int curfail; static struct timeval lastfail; int flags, pages; flags = fr->fr_flags; pages = fr->fr_pages; if ((flags & RFSTOPPED) != 0) MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL); else MPASS(fr->fr_procp == NULL); /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (fr->fr_pd_fd == NULL) return (EINVAL); /* Check if we are using supported flags. */ if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if (fr->fr_procp != NULL) *fr->fr_procp = NULL; else if (fr->fr_pidp != NULL) *fr->fr_pidp = 0; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd, fr->fr_pd_flags, fr->fr_pd_fcaps); if (error != 0) goto fail2; } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail2; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * subtracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ proc_set_cred_init(newproc, crhold(td->td_ucred)); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); sx_xlock(&allproc_lock); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC)); } if (ok) { do_fork(td, fr, newproc, td2, vm2, fp_procdesc); return (0); } error = EAGAIN; sx_sunlock(&proctree_lock); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: crfree(newproc->p_ucred); newproc->p_ucred = NULL; fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td_get_sched(td), p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_fork_kthread_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KPROC) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); td->td_pflags &= ~TDP_FORKING; } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. This function is passed in to fork_exit() * as the first parameter and is called when returning to a new * userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p, *dbg; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { sx_xlock(&proctree_lock); PROC_LOCK(p); if (p->p_pptr->p_ptevents & PTRACE_FORK) { /* * If debugger still wants auto-attach for the * parent's children, do it now. */ dbg = p->p_pptr->p_pptr; proc_set_traced(p, true); CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); proc_reparent(p, dbg); sx_xunlock(&proctree_lock); td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP; ptracestop(td, SIGSTOP, NULL); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ sx_xunlock(&proctree_lock); td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; - _STOPEVENT(p, S_SCX, td->td_dbg_sc_code); + _STOPEVENT(p, S_SCX, td->td_sa.code); if ((p->p_ptevents & PTRACE_SCX) != 0 || (td->td_dbgflags & TDB_BORN) != 0) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_BORN); PROC_UNLOCK(p); } userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 319872) +++ head/sys/kern/kern_thread.c (revision 319873) @@ -1,1260 +1,1260 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xf4, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xfc, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x410, +_Static_assert(offsetof(struct thread, td_frame) == 0x460, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x4b8, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x508, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xbc, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3d0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3e0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x9c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2c8, +_Static_assert(offsetof(struct thread, td_frame) == 0x2ec, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x314, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x338, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x74, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x27c, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x288, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x314, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/kern/subr_syscall.c =================================================================== --- head/sys/kern/subr_syscall.c (revision 319872) +++ head/sys/kern/subr_syscall.c (revision 319873) @@ -1,268 +1,266 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2010 Konstantin Belousov * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include "opt_capsicum.h" #include "opt_ktrace.h" __FBSDID("$FreeBSD$"); #include #include #include #ifdef KTRACE #include #include #endif #include static inline int -syscallenter(struct thread *td, struct syscall_args *sa) +syscallenter(struct thread *td) { struct proc *p; + struct syscall_args *sa; int error, traced; VM_CNT_INC(v_syscall); p = td->td_proc; + sa = &td->td_sa; td->td_pticks = 0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); traced = (p->p_flag & P_TRACED) != 0; if (traced || td->td_dbgflags & TDB_USERWR) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; if (traced) td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } - error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); + error = (p->p_sysent->sv_fetch_syscall_args)(td); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (error == 0) { STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED) { PROC_LOCK(p); - td->td_dbg_sc_code = sa->code; - td->td_dbg_sc_narg = sa->narg; if (p->p_ptevents & PTRACE_SCE) ptracestop((td), SIGTRAP, NULL); PROC_UNLOCK(p); } if (td->td_dbgflags & TDB_USERWR) { /* * Reread syscall number and arguments if * debugger modified registers or memory. */ - error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); - PROC_LOCK(p); - td->td_dbg_sc_code = sa->code; - td->td_dbg_sc_narg = sa->narg; - PROC_UNLOCK(p); + error = (p->p_sysent->sv_fetch_syscall_args)(td); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif if (error != 0) goto retval; } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (IN_CAPABILITY_MODE(td) && !(sa->callp->sy_flags & SYF_CAPENABLED)) { error = ECAPMODE; goto retval; } #endif error = syscall_thread_enter(td, sa->callp); if (error != 0) goto retval; #ifdef KDTRACE_HOOKS /* Give the syscall:::entry DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0) (*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (sa->callp->sy_call)(td, sa->args); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ if ((td->td_pflags & TDP_NERRNO) == 0) td->td_errno = error; #ifdef KDTRACE_HOOKS /* Give the syscall:::return DTrace probe a chance to fire. */ if (systrace_probe_func != NULL && sa->callp->sy_return != 0) (*systrace_probe_func)(sa, SYSTRACE_RETURN, error ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); } retval: KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, "retval0:%#lx", td->td_retval[0], "retval1:%#lx", td->td_retval[1]); if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); return (error); } static inline void -syscallret(struct thread *td, int error, struct syscall_args *sa) +syscallret(struct thread *td, int error) { struct proc *p, *p2; + struct syscall_args *sa; ksiginfo_t ksi; int traced, error1; KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); p = td->td_proc; + sa = &td->td_sa; if ((trap_enotcap || (p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) { error1 = (td->td_pflags & TDP_NERRNO) == 0 ? error : td->td_errno; if (error1 == ENOTCAPABLE || error1 == ECAPMODE) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_errno = error1; ksi.ksi_code = TRAP_CAP; trapsignal(td, &ksi); } } /* * Handle reschedule and other end-of-syscall issues */ userret(td, td->td_frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ? error : td->td_errno, td->td_retval[0]); } #endif td->td_pflags &= ~TDP_NERRNO; if (p->p_flag & P_TRACED) { traced = 1; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); } else traced = 0; /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, sa->code); if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) { PROC_LOCK(p); /* * If tracing the execed process, trap to the debugger * so that breakpoints can be set before the program * executes. If debugger requested tracing of syscall * returns, do it now too. */ if (traced && ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 || (p->p_ptevents & PTRACE_SCX) != 0)) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK); PROC_UNLOCK(p); } if (td->td_pflags & TDP_RFPPWAIT) { /* * Preserve synchronization semantics of vfork. If * waiting for child to exec or exit, fork set * P_PPWAIT on child, and there we sleep on our proc * (in case of exit). * * Do it after the ptracestop() above is finished, to * not block our debugger until child execs or exits * to finish vfork wait. */ td->td_pflags &= ~TDP_RFPPWAIT; p2 = td->td_rfppwait_p; again: PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) { PROC_LOCK(p); if (thread_suspend_check_needed()) { PROC_UNLOCK(p2); thread_suspend_check(0); PROC_UNLOCK(p); goto again; } else { PROC_UNLOCK(p); } cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); if (td->td_dbgflags & TDB_VFORK) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_VFORK) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_VFORK; PROC_UNLOCK(p); } } } Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 319872) +++ head/sys/kern/sys_process.c (revision 319873) @@ -1,1460 +1,1460 @@ /*- * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include struct ptrace_io_desc32 { int piod_op; uint32_t piod_offs; uint32_t piod_addr; uint32_t piod_len; }; struct ptrace_vm_entry32 { int pve_entry; int pve_timestamp; uint32_t pve_start; uint32_t pve_end; uint32_t pve_offset; u_int pve_prot; u_int pve_pathlen; int32_t pve_fileid; u_int pve_fsid; uint32_t pve_path; }; struct ptrace_lwpinfo32 { lwpid_t pl_lwpid; /* LWP described. */ int pl_event; /* Event that stopped the LWP. */ int pl_flags; /* LWP flags. */ sigset_t pl_sigmask; /* LWP signal mask */ sigset_t pl_siglist; /* LWP pending signal */ struct siginfo32 pl_siginfo; /* siginfo for signal */ char pl_tdname[MAXCOMLEN + 1]; /* LWP name. */ pid_t pl_child_pid; /* New child pid */ u_int pl_syscall_code; u_int pl_syscall_narg; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_flag & P_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_FREEBSD32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. */ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_offset_t pageno; /* page number */ vm_prot_t reqprot; int error, fault_flags, page_offset, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ PROC_ASSERT_HELD(p); PROC_LOCK_ASSERT(p, MA_NOTOWNED); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * If we are writing, then we request vm_fault() to create a private * copy of each page. Since these copies will not be writeable by the * process, we must explicity request that they be dirtied. */ writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_offset_t uva; u_int len; vm_page_t m; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault and hold the page on behalf of the process. */ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error != KERN_SUCCESS) { if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EFAULT; break; } /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* Make the I-cache coherent for breakpoints. */ if (writing && error == 0) { vm_map_lock_read(map); if (vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_EXECUTE)) vm_sync_icache(map, uva, len); vm_map_unlock_read(map); } /* * Release the page. */ vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } while (error == 0 && uio->uio_resid > 0); return (error); } static ssize_t proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len, enum uio_rw rw) { struct iovec iov; struct uio uio; ssize_t slen; int error; MPASS(len < SSIZE_MAX); slen = (ssize_t)len; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = va; uio.uio_resid = slen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = rw; uio.uio_td = td; error = proc_rwmem(p, &uio); if (uio.uio_resid == slen) return (-1); return (slen - uio.uio_resid); } ssize_t proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_READ)); } ssize_t proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_WRITE)); } static int ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve) { struct vattr vattr; vm_map_t map; vm_map_entry_t entry; vm_object_t obj, tobj, lobj; struct vmspace *vm; struct vnode *vp; char *freepath, *fullpath; u_int pathlen; int error, index; error = 0; obj = NULL; vm = vmspace_acquire_ref(p); map = &vm->vm_map; vm_map_lock_read(map); do { entry = map->header.next; index = 0; while (index < pve->pve_entry && entry != &map->header) { entry = entry->next; index++; } if (index != pve->pve_entry) { error = EINVAL; break; } while (entry != &map->header && (entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { entry = entry->next; index++; } if (entry == &map->header) { error = ENOENT; break; } /* We got an entry. */ pve->pve_entry = index + 1; pve->pve_timestamp = map->timestamp; pve->pve_start = entry->start; pve->pve_end = entry->end - 1; pve->pve_offset = entry->offset; pve->pve_prot = entry->protection; /* Backing object's path needed? */ if (pve->pve_pathlen == 0) break; pathlen = pve->pve_pathlen; pve->pve_pathlen = 0; obj = entry->object.vm_object; if (obj != NULL) VM_OBJECT_RLOCK(obj); } while (0); vm_map_unlock_read(map); pve->pve_fsid = VNOVAL; pve->pve_fileid = VNOVAL; if (error == 0 && obj != NULL) { lobj = obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; pve->pve_offset += tobj->backing_object_offset; } vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { freepath = NULL; fullpath = NULL; vn_fullpath(td, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) { pve->pve_fileid = vattr.va_fileid; pve->pve_fsid = vattr.va_fsid; } vput(vp); if (fullpath != NULL) { pve->pve_pathlen = strlen(fullpath) + 1; if (pve->pve_pathlen <= pathlen) { error = copyout(fullpath, pve->pve_path, pve->pve_pathlen); } else error = ENAMETOOLONG; } if (freepath != NULL) free(freepath, M_TEMP); } } vmspace_free(vm); if (error == 0) CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p", p->p_pid, pve->pve_entry, pve->pve_start); return (error); } #ifdef COMPAT_FREEBSD32 static int ptrace_vm_entry32(struct thread *td, struct proc *p, struct ptrace_vm_entry32 *pve32) { struct ptrace_vm_entry pve; int error; pve.pve_entry = pve32->pve_entry; pve.pve_pathlen = pve32->pve_pathlen; pve.pve_path = (void *)(uintptr_t)pve32->pve_path; error = ptrace_vm_entry(td, p, &pve); if (error == 0) { pve32->pve_entry = pve.pve_entry; pve32->pve_timestamp = pve.pve_timestamp; pve32->pve_start = pve.pve_start; pve32->pve_end = pve.pve_end; pve32->pve_offset = pve.pve_offset; pve32->pve_prot = pve.pve_prot; pve32->pve_fileid = pve.pve_fileid; pve32->pve_fsid = pve.pve_fsid; } pve32->pve_pathlen = pve.pve_pathlen; return (error); } static void ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl, struct ptrace_lwpinfo32 *pl32) { pl32->pl_lwpid = pl->pl_lwpid; pl32->pl_event = pl->pl_event; pl32->pl_flags = pl->pl_flags; pl32->pl_sigmask = pl->pl_sigmask; pl32->pl_siglist = pl->pl_siglist; siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo); strcpy(pl32->pl_tdname, pl->pl_tdname); pl32->pl_child_pid = pl->pl_child_pid; pl32->pl_syscall_code = pl->pl_syscall_code; pl32->pl_syscall_narg = pl->pl_syscall_narg; } #endif /* COMPAT_FREEBSD32 */ /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_FREEBSD32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. * COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif int sys_ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. */ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct ptrace_vm_entry pve; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_FREEBSD32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; struct ptrace_lwpinfo32 pl32; struct ptrace_vm_entry32 pve32; #endif int ptevents; } r; void *addr; int error = 0; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; if (SV_CURPROC_FLAG(SV_ILP32)) wrap32 = 1; #endif AUDIT_ARG_PID(uap->pid); AUDIT_ARG_CMD(uap->req); AUDIT_ARG_VALUE(uap->data); addr = &r; switch (uap->req) { case PT_GET_EVENT_MASK: case PT_GETREGS: case PT_GETFPREGS: case PT_GETDBREGS: case PT_LWPINFO: break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_SET_EVENT_MASK: if (uap->data != sizeof(r.ptevents)) error = EINVAL; else error = copyin(uap->addr, &r.ptevents, uap->data); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; case PT_VM_ENTRY: error = COPYIN(uap->addr, &r.pve, sizeof r.pve); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_VM_ENTRY: error = COPYOUT(&r.pve, uap->addr, sizeof r.pve); break; case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_GET_EVENT_MASK: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.ptevents, uap->addr, uap->data); break; case PT_LWPINFO: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.pl, uap->addr, uap->data); break; } return (error); } #undef COPYIN #undef COPYOUT #ifdef COMPAT_FREEBSD32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif void proc_set_traced(struct proc *p, bool stop) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; p->p_oppid = p->p_pptr->p_pid; } int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL, *td3; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; int error, num, tmp; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_FREEBSD32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; struct ptrace_lwpinfo32 *pl32 = NULL; struct ptrace_lwpinfo plr; #endif curp = td->td_proc; /* Lock proctree before locking the process. */ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_FOLLOW_FORK: case PT_LWP_EVENTS: case PT_GET_EVENT_MASK: case PT_SET_EVENT_MASK: case PT_DETACH: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } p = td2->td_proc; tid = pid; pid = p->p_pid; } } AUDIT_ARG_PROCESS(p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_FREEBSD32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32)) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* * Always legal, when there is a parent process which * could trace us. Otherwise, reject. */ if ((p->p_flag & P_TRACED) != 0) { error = EBUSY; goto fail; } if (p->p_pptr == initproc) { error = EPERM; goto fail; } break; case PT_ATTACH: /* Self */ if (p == td->td_proc) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } if ((p->p_flag & P_STOPPED_TRACE) == 0) { static int count = 0; if (count++ == 0) printf("P_STOPPED_TRACE not set.\n"); } /* OK */ break; } /* Keep this process around until we finish this request. */ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); break; case PT_ATTACH: /* security check done above */ /* * It would be nice if the tracing relationship was separate * from the parent relationship but that would require * another set of links in the proc struct or for "wait" * to scan the entire proc table. To make life easier, * we just re-parent the process we're trying to trace. * The old parent is remembered so we can put things back * on a "detach". */ proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc); } data = SIGSTOP; CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); goto sendsig; /* in PT_CONTINUE below */ case PT_CLEARSTEP: CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); break; case PT_SUSPEND: CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_SUSPEND; thread_lock(td2); td2->td_flags |= TDF_NEEDSUSPCHK; thread_unlock(td2); break; case PT_RESUME: CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags &= ~TDB_SUSPEND; break; case PT_FOLLOW_FORK: CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_FORK; else p->p_ptevents &= ~PTRACE_FORK; break; case PT_LWP_EVENTS: CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_LWP; else p->p_ptevents &= ~PTRACE_LWP; break; case PT_GET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid, p->p_ptevents); *(int *)addr = p->p_ptevents; break; case PT_SET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x", p->p_pid, p->p_ptevents, tmp); p->p_ptevents = tmp; break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: CTR2(KTR_PTRACE, "PT_STEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); if (error) goto out; break; case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) goto out; } switch (req) { case PT_TO_SCE: p->p_ptevents |= PTRACE_SCE; CTR4(KTR_PTRACE, "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_TO_SCX: p->p_ptevents |= PTRACE_SCX; CTR4(KTR_PTRACE, "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_SYSCALL: p->p_ptevents |= PTRACE_SYSCALL; CTR4(KTR_PTRACE, "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_CONTINUE: CTR3(KTR_PTRACE, "PT_CONTINUE: pid %d, PC = %#lx, sig = %d", p->p_pid, (u_long)(uintfptr_t)addr, data); break; } break; case PT_DETACH: /* * Reset the process parent. * * NB: This clears P_TRACED before reparenting * a detached process back to its original * parent. Otherwise the debugee will be set * as an orphan of the debugger. */ p->p_flag &= ~(P_TRACED | P_WAITED); if (p->p_oppid != p->p_pptr->p_pid) { PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); pp = proc_realparent(p); proc_reparent(p, pp); if (pp == initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", p->p_pid, pp->p_pid, data); } else CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d", p->p_pid, data); p->p_oppid = 0; p->p_ptevents = 0; FOREACH_THREAD_IN_PROC(p, td3) { if ((td3->td_dbgflags & TDB_FSTP) != 0) { sigqueue_delete(&td3->td_sigqueue, SIGSTOP); } td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP); } if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) { sigqueue_delete(&p->p_sigqueue, SIGSTOP); p->p_flag2 &= ~P2_PTRACE_FSTP; } /* should we send SIGCHLD? */ /* childproc_continued(p); */ break; } sendsig: if (proctree_locked) { sx_xunlock(&proctree_lock); proctree_locked = 0; } p->p_xsig = data; p->p_xthread = NULL; if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) { /* deliver or queue signal */ td2->td_dbgflags &= ~TDB_XSIG; td2->td_xsig = data; /* * P_WKILLED is insurance that a PT_KILL/SIGKILL always * works immediately, even if another thread is * unsuspended first and attempts to handle a different * signal or if the POSIX.1b style signal queue cannot * accommodate any new signals. */ if (data == SIGKILL) p->p_flag |= P_WKILLED; if (req == PT_DETACH) { FOREACH_THREAD_IN_PROC(p, td3) td3->td_dbgflags &= ~TDB_SUSPEND; } /* * unsuspend all threads, to not let a thread run, * you should use PT_SUSPEND to suspend it before * continuing process. */ PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); if (req == PT_ATTACH) kern_psignal(p, data); } else { if (data) kern_psignal(p, data); } break; case PT_WRITE_I: case PT_WRITE_D: td2->td_dbgflags |= TDB_USERWR; PROC_UNLOCK(p); error = 0; if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x", p->p_pid, addr, data); PROC_LOCK(p); break; case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); error = tmp = 0; if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x", p->p_pid, addr, tmp); td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_FREEBSD32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_FREEBSD32 tmp = wrap32 ? piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); td2->td_dbgflags |= TDB_USERWR; uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_FREEBSD32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid); data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || #ifdef COMPAT_FREEBSD32 (!wrap32 && data > sizeof(*pl)) || (wrap32 && data > sizeof(*pl32))) { #else data > sizeof(*pl)) { #endif error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { pl = &plr; pl32 = addr; } else #endif pl = addr; pl->pl_lwpid = td2->td_tid; pl->pl_event = PL_EVENT_NONE; pl->pl_flags = 0; if (td2->td_dbgflags & TDB_XSIG) { pl->pl_event = PL_EVENT_SIGNAL; if (td2->td_si.si_signo != 0 && #ifdef COMPAT_FREEBSD32 ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo)) || (wrap32 && data >= offsetof(struct ptrace_lwpinfo32, pl_siginfo) + sizeof(struct siginfo32))) #else data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo) #endif ){ pl->pl_flags |= PL_FLAG_SI; pl->pl_siginfo = td2->td_si; } } if ((pl->pl_flags & PL_FLAG_SI) == 0) bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo)); if (td2->td_dbgflags & TDB_SCE) pl->pl_flags |= PL_FLAG_SCE; else if (td2->td_dbgflags & TDB_SCX) pl->pl_flags |= PL_FLAG_SCX; if (td2->td_dbgflags & TDB_EXEC) pl->pl_flags |= PL_FLAG_EXEC; if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; if (td2->td_dbgflags & TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORKED; } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) pl->pl_flags |= PL_FLAG_BORN; if (td2->td_dbgflags & TDB_EXIT) pl->pl_flags |= PL_FLAG_EXITED; pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; strcpy(pl->pl_tdname, td2->td_name); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) { - pl->pl_syscall_code = td2->td_dbg_sc_code; - pl->pl_syscall_narg = td2->td_dbg_sc_narg; + pl->pl_syscall_code = td2->td_sa.code; + pl->pl_syscall_narg = td2->td_sa.narg; } else { pl->pl_syscall_code = 0; pl->pl_syscall_narg = 0; } #ifdef COMPAT_FREEBSD32 if (wrap32) ptrace_lwpinfo_to32(pl, pl32); #endif CTR6(KTR_PTRACE, "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d", td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags, pl->pl_child_pid, pl->pl_syscall_code); break; case PT_GETNUMLWPS: CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid, p->p_numthreads); td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d", p->p_pid, data, p->p_numthreads); if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_VM_TIMESTAMP: CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d", p->p_pid, p->p_vmspace->vm_map.timestamp); td->td_retval[0] = p->p_vmspace->vm_map.timestamp; break; case PT_VM_ENTRY: PROC_UNLOCK(p); #ifdef COMPAT_FREEBSD32 if (wrap32) error = ptrace_vm_entry32(td, p, addr); else #endif error = ptrace_vm_entry(td, p, addr); PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event, val); do { if (event != S_EXIT) p->p_xsig = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? */ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: head/sys/mips/mips/trap.c =================================================================== --- head/sys/mips/mips/trap.c (revision 319872) +++ head/sys/mips/mips/trap.c (revision 319873) @@ -1,1709 +1,1712 @@ /* $OpenBSD: trap.c,v 1.19 1998/09/30 12:40:41 pefo Exp $ */ /* tracked to 1.23 */ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: trap.c 1.32 91/04/06 * * from: @(#)trap.c 8.5 (Berkeley) 1/11/94 * JNPR: trap.c,v 1.13.2.2 2007/08/29 10:03:49 girish */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #include #endif #ifdef KDTRACE_HOOKS #include #endif #ifdef TRAP_DEBUG int trap_debug = 0; SYSCTL_INT(_machdep, OID_AUTO, trap_debug, CTLFLAG_RW, &trap_debug, 0, "Debug information on all traps"); #endif #define lbu_macro(data, addr) \ __asm __volatile ("lbu %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lb_macro(data, addr) \ __asm __volatile ("lb %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lwl_macro(data, addr) \ __asm __volatile ("lwl %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lwr_macro(data, addr) \ __asm __volatile ("lwr %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define ldl_macro(data, addr) \ __asm __volatile ("ldl %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define ldr_macro(data, addr) \ __asm __volatile ("ldr %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define sb_macro(data, addr) \ __asm __volatile ("sb %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define swl_macro(data, addr) \ __asm __volatile ("swl %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define swr_macro(data, addr) \ __asm __volatile ("swr %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define sdl_macro(data, addr) \ __asm __volatile ("sdl %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define sdr_macro(data, addr) \ __asm __volatile ("sdr %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ static void log_illegal_instruction(const char *, struct trapframe *); static void log_bad_page_fault(char *, struct trapframe *, int); static void log_frame_dump(struct trapframe *frame); static void get_mapping_info(vm_offset_t, pd_entry_t **, pt_entry_t **); int (*dtrace_invop_jump_addr)(struct trapframe *); #ifdef TRAP_DEBUG static void trap_frame_dump(struct trapframe *frame); #endif void (*machExceptionTable[]) (void)= { /* * The kernel exception handlers. */ MipsKernIntr, /* external interrupt */ MipsKernGenException, /* TLB modification */ MipsTLBInvalidException,/* TLB miss (load or instr. fetch) */ MipsTLBInvalidException,/* TLB miss (store) */ MipsKernGenException, /* address error (load or I-fetch) */ MipsKernGenException, /* address error (store) */ MipsKernGenException, /* bus error (I-fetch) */ MipsKernGenException, /* bus error (load or store) */ MipsKernGenException, /* system call */ MipsKernGenException, /* breakpoint */ MipsKernGenException, /* reserved instruction */ MipsKernGenException, /* coprocessor unusable */ MipsKernGenException, /* arithmetic overflow */ MipsKernGenException, /* trap exception */ MipsKernGenException, /* virtual coherence exception inst */ MipsKernGenException, /* floating point exception */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* watch exception */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* virtual coherence exception data */ /* * The user exception handlers. */ MipsUserIntr, /* 0 */ MipsUserGenException, /* 1 */ MipsTLBInvalidException,/* 2 */ MipsTLBInvalidException,/* 3 */ MipsUserGenException, /* 4 */ MipsUserGenException, /* 5 */ MipsUserGenException, /* 6 */ MipsUserGenException, /* 7 */ MipsUserGenException, /* 8 */ MipsUserGenException, /* 9 */ MipsUserGenException, /* 10 */ MipsUserGenException, /* 11 */ MipsUserGenException, /* 12 */ MipsUserGenException, /* 13 */ MipsUserGenException, /* 14 */ MipsUserGenException, /* 15 */ MipsUserGenException, /* 16 */ MipsUserGenException, /* 17 */ MipsUserGenException, /* 18 */ MipsUserGenException, /* 19 */ MipsUserGenException, /* 20 */ MipsUserGenException, /* 21 */ MipsUserGenException, /* 22 */ MipsUserGenException, /* 23 */ MipsUserGenException, /* 24 */ MipsUserGenException, /* 25 */ MipsUserGenException, /* 26 */ MipsUserGenException, /* 27 */ MipsUserGenException, /* 28 */ MipsUserGenException, /* 29 */ MipsUserGenException, /* 20 */ MipsUserGenException, /* 31 */ }; char *trap_type[] = { "external interrupt", "TLB modification", "TLB miss (load or instr. fetch)", "TLB miss (store)", "address error (load or I-fetch)", "address error (store)", "bus error (I-fetch)", "bus error (load or store)", "system call", "breakpoint", "reserved instruction", "coprocessor unusable", "arithmetic overflow", "trap", "virtual coherency instruction", "floating point", "reserved 16", "reserved 17", "reserved 18", "reserved 19", "reserved 20", "reserved 21", "reserved 22", "watch", "reserved 24", "reserved 25", "reserved 26", "reserved 27", "reserved 28", "reserved 29", "reserved 30", "virtual coherency data", }; #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) struct trapdebug trapdebug[TRAPSIZE], *trp = trapdebug; #endif #define KERNLAND(x) ((vm_offset_t)(x) >= VM_MIN_KERNEL_ADDRESS && (vm_offset_t)(x) < VM_MAX_KERNEL_ADDRESS) #define DELAYBRANCH(x) ((int)(x) < 0) /* * MIPS load/store access type */ enum { MIPS_LHU_ACCESS = 1, MIPS_LH_ACCESS, MIPS_LWU_ACCESS, MIPS_LW_ACCESS, MIPS_LD_ACCESS, MIPS_SH_ACCESS, MIPS_SW_ACCESS, MIPS_SD_ACCESS }; char *access_name[] = { "Load Halfword Unsigned", "Load Halfword", "Load Word Unsigned", "Load Word", "Load Doubleword", "Store Halfword", "Store Word", "Store Doubleword" }; #ifdef CPU_CNMIPS #include #endif static int allow_unaligned_acc = 1; SYSCTL_INT(_vm, OID_AUTO, allow_unaligned_acc, CTLFLAG_RW, &allow_unaligned_acc, 0, "Allow unaligned accesses"); /* * FP emulation is assumed to work on O32, but the code is outdated and crufty * enough that it's a more sensible default to have it disabled when using * other ABIs. At the very least, it needs a lot of help in using * type-semantic ABI-oblivious macros for everything it does. */ #if defined(__mips_o32) static int emulate_fp = 1; #else static int emulate_fp = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, emulate_fp, CTLFLAG_RW, &emulate_fp, 0, "Emulate unimplemented FPU instructions"); static int emulate_unaligned_access(struct trapframe *frame, int mode); extern void fswintrberr(void); /* XXX */ int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { - struct trapframe *locr0 = td->td_frame; + struct trapframe *locr0; struct sysentvec *se; + struct syscall_args *sa; int error, nsaved; + locr0 = td->td_frame; + sa = &td->td_sa; + bzero(sa->args, sizeof(sa->args)); /* compute next PC after syscall instruction */ td->td_pcb->pcb_tpc = sa->trapframe->pc; /* Remember if restart */ if (DELAYBRANCH(sa->trapframe->cause)) /* Check BD bit */ locr0->pc = MipsEmulateBranch(locr0, sa->trapframe->pc, 0, 0); else locr0->pc += sizeof(int); sa->code = locr0->v0; switch (sa->code) { case SYS___syscall: case SYS_syscall: /* * This is an indirect syscall, in which the code is the first argument. */ #if (!defined(__mips_n32) && !defined(__mips_n64)) || defined(COMPAT_FREEBSD32) if (sa->code == SYS___syscall && SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* * Like syscall, but code is a quad, so as to maintain alignment * for the rest of the arguments. */ if (_QUAD_LOWWORD == 0) sa->code = locr0->a0; else sa->code = locr0->a1; sa->args[0] = locr0->a2; sa->args[1] = locr0->a3; nsaved = 2; break; } #endif /* * This is either not a quad syscall, or is a quad syscall with a * new ABI in which quads fit in a single register. */ sa->code = locr0->a0; sa->args[0] = locr0->a1; sa->args[1] = locr0->a2; sa->args[2] = locr0->a3; nsaved = 3; #if defined(__mips_n32) || defined(__mips_n64) #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) { #endif /* * Non-o32 ABIs support more arguments in registers. */ sa->args[3] = locr0->a4; sa->args[4] = locr0->a5; sa->args[5] = locr0->a6; sa->args[6] = locr0->a7; nsaved += 4; #ifdef COMPAT_FREEBSD32 } #endif #endif break; default: /* * A direct syscall, arguments are just parameters to the syscall. */ sa->args[0] = locr0->a0; sa->args[1] = locr0->a1; sa->args[2] = locr0->a2; sa->args[3] = locr0->a3; nsaved = 4; #if defined (__mips_n32) || defined(__mips_n64) #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) { #endif /* * Non-o32 ABIs support more arguments in registers. */ sa->args[4] = locr0->a4; sa->args[5] = locr0->a5; sa->args[6] = locr0->a6; sa->args[7] = locr0->a7; nsaved += 4; #ifdef COMPAT_FREEBSD32 } #endif #endif break; } #ifdef TRAP_DEBUG if (trap_debug) printf("SYSCALL #%d pid:%u\n", sa->code, td->td_proc->p_pid); #endif se = td->td_proc->p_sysent; /* * XXX * Shouldn't this go before switching on the code? */ if (se->sv_mask) sa->code &= se->sv_mask; if (sa->code >= se->sv_size) sa->callp = &se->sv_table[0]; else sa->callp = &se->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (sa->narg > nsaved) { #if defined(__mips_n32) || defined(__mips_n64) /* * XXX * Is this right for new ABIs? I think the 4 there * should be 8, size there are 8 registers to skip, * not 4, but I'm not certain. */ #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) #endif printf("SYSCALL #%u pid:%u, narg (%u) > nsaved (%u).\n", sa->code, td->td_proc->p_pid, sa->narg, nsaved); #endif #if (defined(__mips_n32) || defined(__mips_n64)) && defined(COMPAT_FREEBSD32) if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { unsigned i; int32_t arg; error = 0; /* XXX GCC is awful. */ for (i = nsaved; i < sa->narg; i++) { error = copyin((caddr_t)(intptr_t)(locr0->sp + (4 + (i - nsaved)) * sizeof(int32_t)), (caddr_t)&arg, sizeof arg); if (error != 0) break; sa->args[i] = arg; } } else #endif error = copyin((caddr_t)(intptr_t)(locr0->sp + 4 * sizeof(register_t)), (caddr_t)&sa->args[nsaved], (u_int)(sa->narg - nsaved) * sizeof(register_t)); if (error != 0) { locr0->v0 = error; locr0->a3 = 1; } } else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = locr0->v1; } return (error); } #undef __FBSDID #define __FBSDID(x) #include "../../kern/subr_syscall.c" /* * Handle an exception. * Called from MipsKernGenException() or MipsUserGenException() * when a processor trap occurs. * In the case of a kernel trap, we return the pc where to resume if * p->p_addr->u_pcb.pcb_onfault is set, otherwise, return old pc. */ register_t trap(struct trapframe *trapframe) { int type, usermode; int i = 0; unsigned ucode = 0; struct thread *td = curthread; struct proc *p = curproc; vm_prot_t ftype; pmap_t pmap; int access_type; ksiginfo_t ksi; char *msg = NULL; intptr_t addr = 0; register_t pc; int cop; register_t *frame_regs; trapdebug_enter(trapframe, 0); type = (trapframe->cause & MIPS_CR_EXC_CODE) >> MIPS_CR_EXC_CODE_SHIFT; if (TRAPF_USERMODE(trapframe)) { type |= T_USER; usermode = 1; } else { usermode = 0; } /* * Enable hardware interrupts if they were on before the trap. If it * was off disable all so we don't accidently enable it when doing a * return to userland. */ if (trapframe->sr & MIPS_SR_INT_IE) { set_intr_mask(trapframe->sr & MIPS_SR_INT_MASK); intr_enable(); } else { intr_disable(); } #ifdef TRAP_DEBUG if (trap_debug) { static vm_offset_t last_badvaddr = 0; static vm_offset_t this_badvaddr = 0; static int count = 0; u_int32_t pid; printf("trap type %x (%s - ", type, trap_type[type & (~T_USER)]); if (type & T_USER) printf("user mode)\n"); else printf("kernel mode)\n"); #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif pid = mips_rd_entryhi() & TLBHI_ASID_MASK; printf("badaddr = %#jx, pc = %#jx, ra = %#jx, sp = %#jx, sr = %jx, pid = %d, ASID = %u\n", (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra, (intmax_t)trapframe->sp, (intmax_t)trapframe->sr, (curproc ? curproc->p_pid : -1), pid); switch (type & ~T_USER) { case T_TLB_MOD: case T_TLB_LD_MISS: case T_TLB_ST_MISS: case T_ADDR_ERR_LD: case T_ADDR_ERR_ST: this_badvaddr = trapframe->badvaddr; break; case T_SYSCALL: this_badvaddr = trapframe->ra; break; default: this_badvaddr = trapframe->pc; break; } if ((last_badvaddr == this_badvaddr) && ((type & ~T_USER) != T_SYSCALL) && ((type & ~T_USER) != T_COP_UNUSABLE)) { if (++count == 3) { trap_frame_dump(trapframe); panic("too many faults at %p\n", (void *)last_badvaddr); } } else { last_badvaddr = this_badvaddr; count = 0; } } #endif #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ /* * XXXDTRACE: add pid probe handler here (if ever) */ if (!usermode) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(trapframe, type) != 0) return (trapframe->pc); } #endif switch (type) { case T_MCHECK: #ifdef DDB kdb_trap(type, 0, trapframe); #endif panic("MCHECK\n"); break; case T_TLB_MOD: /* check for kernel address */ if (KERNLAND(trapframe->badvaddr)) { if (pmap_emulate_modified(kernel_pmap, trapframe->badvaddr) != 0) { ftype = VM_PROT_WRITE; goto kernel_fault; } return (trapframe->pc); } /* FALLTHROUGH */ case T_TLB_MOD + T_USER: pmap = &p->p_vmspace->vm_pmap; if (pmap_emulate_modified(pmap, trapframe->badvaddr) != 0) { ftype = VM_PROT_WRITE; goto dofault; } if (!usermode) return (trapframe->pc); goto out; case T_TLB_LD_MISS: case T_TLB_ST_MISS: ftype = (type == T_TLB_ST_MISS) ? VM_PROT_WRITE : VM_PROT_READ; /* check for kernel address */ if (KERNLAND(trapframe->badvaddr)) { vm_offset_t va; int rv; kernel_fault: va = trunc_page((vm_offset_t)trapframe->badvaddr); rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) return (trapframe->pc); if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } goto err; } /* * It is an error for the kernel to access user space except * through the copyin/copyout routines. */ if (td->td_pcb->pcb_onfault == NULL) goto err; /* check for fuswintr() or suswintr() getting a page fault */ /* XXX There must be a nicer way to do this. */ if (td->td_pcb->pcb_onfault == fswintrberr) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } goto dofault; case T_TLB_LD_MISS + T_USER: ftype = VM_PROT_READ; goto dofault; case T_TLB_ST_MISS + T_USER: ftype = VM_PROT_WRITE; dofault: { vm_offset_t va; struct vmspace *vm; vm_map_t map; int rv = 0; vm = p->p_vmspace; map = &vm->vm_map; va = trunc_page((vm_offset_t)trapframe->badvaddr); if (KERNLAND(trapframe->badvaddr)) { /* * Don't allow user-mode faults in kernel * address space. */ goto nogo; } rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ #ifdef VMFAULT_TRACE printf("vm_fault(%p (pmap %p), %p (%p), %x, %d) -> %x at pc %p\n", map, &vm->vm_pmap, (void *)va, (void *)(intptr_t)trapframe->badvaddr, ftype, VM_FAULT_NORMAL, rv, (void *)(intptr_t)trapframe->pc); #endif if (rv == KERN_SUCCESS) { if (!usermode) { return (trapframe->pc); } goto out; } nogo: if (!usermode) { if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } goto err; } i = SIGSEGV; if (rv == KERN_PROTECTION_FAILURE) ucode = SEGV_ACCERR; else ucode = SEGV_MAPERR; addr = trapframe->pc; msg = "BAD_PAGE_FAULT"; log_bad_page_fault(msg, trapframe, type); break; } case T_ADDR_ERR_LD + T_USER: /* misaligned or kseg access */ case T_ADDR_ERR_ST + T_USER: /* misaligned or kseg access */ if (trapframe->badvaddr < 0 || trapframe->badvaddr >= VM_MAXUSER_ADDRESS) { msg = "ADDRESS_SPACE_ERR"; } else if (allow_unaligned_acc) { int mode; if (type == (T_ADDR_ERR_LD + T_USER)) mode = VM_PROT_READ; else mode = VM_PROT_WRITE; access_type = emulate_unaligned_access(trapframe, mode); if (access_type != 0) goto out; msg = "ALIGNMENT_FIX_ERR"; } else { msg = "ADDRESS_ERR"; } /* FALL THROUGH */ case T_BUS_ERR_IFETCH + T_USER: /* BERR asserted to cpu */ case T_BUS_ERR_LD_ST + T_USER: /* BERR asserted to cpu */ ucode = 0; /* XXX should be VM_PROT_something */ i = SIGBUS; addr = trapframe->pc; if (!msg) msg = "BUS_ERR"; log_bad_page_fault(msg, trapframe, type); break; case T_SYSCALL + T_USER: { - struct syscall_args sa; int error; - sa.trapframe = trapframe; - error = syscallenter(td, &sa); + td->td_sa.trapframe = trapframe; + error = syscallenter(td); #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) if (trp == trapdebug) - trapdebug[TRAPSIZE - 1].code = sa.code; + trapdebug[TRAPSIZE - 1].code = td->td_sa.code; else - trp[-1].code = sa.code; + trp[-1].code = td->td_sa.code; #endif - trapdebug_enter(td->td_frame, -sa.code); + trapdebug_enter(td->td_frame, -td->td_sa.code); /* * The sync'ing of I & D caches for SYS_ptrace() is * done by procfs_domem() through procfs_rwmem() * instead of being done here under a special check * for SYS_ptrace(). */ - syscallret(td, error, &sa); + syscallret(td, error); return (trapframe->pc); } #if defined(KDTRACE_HOOKS) || defined(DDB) case T_BREAK: #ifdef KDTRACE_HOOKS if (!usermode && dtrace_invop_jump_addr != 0) { dtrace_invop_jump_addr(trapframe); return (trapframe->pc); } #endif #ifdef DDB kdb_trap(type, 0, trapframe); return (trapframe->pc); #endif #endif case T_BREAK + T_USER: { intptr_t va; uint32_t instr; /* compute address of break instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); /* read break instruction */ instr = fuword32((caddr_t)va); #if 0 printf("trap: %s (%d) breakpoint %x at %x: (adr %x ins %x)\n", p->p_comm, p->p_pid, instr, trapframe->pc, p->p_md.md_ss_addr, p->p_md.md_ss_instr); /* XXX */ #endif if (td->td_md.md_ss_addr != va || instr != MIPS_BREAK_SSTEP) { i = SIGTRAP; addr = trapframe->pc; break; } /* * The restoration of the original instruction and * the clearing of the breakpoint will be done later * by the call to ptrace_clear_single_step() in * issignal() when SIGTRAP is processed. */ addr = trapframe->pc; i = SIGTRAP; break; } case T_IWATCH + T_USER: case T_DWATCH + T_USER: { intptr_t va; /* compute address of trapped instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); printf("watch exception @ %p\n", (void *)va); i = SIGTRAP; addr = va; break; } case T_TRAP + T_USER: { intptr_t va; uint32_t instr; struct trapframe *locr0 = td->td_frame; /* compute address of trap instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); /* read break instruction */ instr = fuword32((caddr_t)va); if (DELAYBRANCH(trapframe->cause)) { /* Check BD bit */ locr0->pc = MipsEmulateBranch(locr0, trapframe->pc, 0, 0); } else { locr0->pc += sizeof(int); } addr = va; i = SIGEMT; /* Stuff it with something for now */ break; } case T_RES_INST + T_USER: { InstFmt inst; inst = *(InstFmt *)(intptr_t)trapframe->pc; switch (inst.RType.op) { case OP_SPECIAL3: switch (inst.RType.func) { case OP_RDHWR: /* Register 29 used for TLS */ if (inst.RType.rd == 29) { frame_regs = &(trapframe->zero); frame_regs[inst.RType.rt] = (register_t)(intptr_t)td->td_md.md_tls; frame_regs[inst.RType.rt] += td->td_md.md_tls_tcb_offset; trapframe->pc += sizeof(int); goto out; } break; } break; } log_illegal_instruction("RES_INST", trapframe); i = SIGILL; addr = trapframe->pc; } break; case T_C2E: case T_C2E + T_USER: goto err; break; case T_COP_UNUSABLE: #ifdef CPU_CNMIPS cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT; /* Handle only COP2 exception */ if (cop != 2) goto err; addr = trapframe->pc; /* save userland cop2 context if it has been touched */ if ((td->td_md.md_flags & MDTD_COP2USED) && (td->td_md.md_cop2owner == COP2_OWNER_USERLAND)) { if (td->td_md.md_ucop2) octeon_cop2_save(td->td_md.md_ucop2); else panic("COP2 was used in user mode but md_ucop2 is NULL"); } if (td->td_md.md_cop2 == NULL) { td->td_md.md_cop2 = octeon_cop2_alloc_ctx(); if (td->td_md.md_cop2 == NULL) panic("Failed to allocate COP2 context"); memset(td->td_md.md_cop2, 0, sizeof(*td->td_md.md_cop2)); } octeon_cop2_restore(td->td_md.md_cop2); /* Make userland re-request its context */ td->td_frame->sr &= ~MIPS_SR_COP_2_BIT; td->td_md.md_flags |= MDTD_COP2USED; td->td_md.md_cop2owner = COP2_OWNER_KERNEL; /* Enable COP2, it will be disabled in cpu_switch */ mips_wr_status(mips_rd_status() | MIPS_SR_COP_2_BIT); return (trapframe->pc); #else goto err; break; #endif case T_COP_UNUSABLE + T_USER: cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT; if (cop == 1) { #if !defined(CPU_HAVEFPU) /* FP (COP1) instruction */ log_illegal_instruction("COP1_UNUSABLE", trapframe); i = SIGILL; break; #else addr = trapframe->pc; MipsSwitchFPState(PCPU_GET(fpcurthread), td->td_frame); PCPU_SET(fpcurthread, td); #if defined(__mips_n64) td->td_frame->sr |= MIPS_SR_COP_1_BIT | MIPS_SR_FR; #else td->td_frame->sr |= MIPS_SR_COP_1_BIT; #endif td->td_md.md_flags |= MDTD_FPUSED; goto out; #endif } #ifdef CPU_CNMIPS else if (cop == 2) { addr = trapframe->pc; if ((td->td_md.md_flags & MDTD_COP2USED) && (td->td_md.md_cop2owner == COP2_OWNER_KERNEL)) { if (td->td_md.md_cop2) octeon_cop2_save(td->td_md.md_cop2); else panic("COP2 was used in kernel mode but md_cop2 is NULL"); } if (td->td_md.md_ucop2 == NULL) { td->td_md.md_ucop2 = octeon_cop2_alloc_ctx(); if (td->td_md.md_ucop2 == NULL) panic("Failed to allocate userland COP2 context"); memset(td->td_md.md_ucop2, 0, sizeof(*td->td_md.md_ucop2)); } octeon_cop2_restore(td->td_md.md_ucop2); td->td_frame->sr |= MIPS_SR_COP_2_BIT; td->td_md.md_flags |= MDTD_COP2USED; td->td_md.md_cop2owner = COP2_OWNER_USERLAND; goto out; } #endif else { log_illegal_instruction("COPn_UNUSABLE", trapframe); i = SIGILL; /* only FPU instructions allowed */ break; } case T_FPE: #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) trapDump("fpintr"); #else printf("FPU Trap: PC %#jx CR %x SR %x\n", (intmax_t)trapframe->pc, (unsigned)trapframe->cause, (unsigned)trapframe->sr); goto err; #endif case T_FPE + T_USER: if (!emulate_fp) { i = SIGFPE; addr = trapframe->pc; break; } MipsFPTrap(trapframe->sr, trapframe->cause, trapframe->pc); goto out; case T_OVFLOW + T_USER: i = SIGFPE; addr = trapframe->pc; break; case T_ADDR_ERR_LD: /* misaligned access */ case T_ADDR_ERR_ST: /* misaligned access */ #ifdef TRAP_DEBUG if (trap_debug) { printf("+++ ADDR_ERR: type = %d, badvaddr = %#jx\n", type, (intmax_t)trapframe->badvaddr); } #endif /* Only allow emulation on a user address */ if (allow_unaligned_acc && ((vm_offset_t)trapframe->badvaddr < VM_MAXUSER_ADDRESS)) { int mode; if (type == T_ADDR_ERR_LD) mode = VM_PROT_READ; else mode = VM_PROT_WRITE; access_type = emulate_unaligned_access(trapframe, mode); if (access_type != 0) return (trapframe->pc); } /* FALLTHROUGH */ case T_BUS_ERR_LD_ST: /* BERR asserted to cpu */ if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } /* FALLTHROUGH */ default: err: #if !defined(SMP) && defined(DEBUG) trapDump("trap"); #endif #ifdef SMP printf("cpu:%d-", PCPU_GET(cpuid)); #endif printf("Trap cause = %d (%s - ", type, trap_type[type & (~T_USER)]); if (type & T_USER) printf("user mode)\n"); else printf("kernel mode)\n"); #ifdef TRAP_DEBUG if (trap_debug) printf("badvaddr = %#jx, pc = %#jx, ra = %#jx, sr = %#jxx\n", (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra, (intmax_t)trapframe->sr); #endif #ifdef KDB if (debugger_on_panic || kdb_active) { kdb_trap(type, 0, trapframe); } #endif panic("trap"); } td->td_frame->pc = trapframe->pc; td->td_frame->cause = trapframe->cause; td->td_frame->badvaddr = trapframe->badvaddr; ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; trapsignal(td, &ksi); out: /* * Note: we should only get here if returning to user mode. */ userret(td, trapframe); return (trapframe->pc); } #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) void trapDump(char *msg) { register_t s; int i; s = intr_disable(); printf("trapDump(%s)\n", msg); for (i = 0; i < TRAPSIZE; i++) { if (trp == trapdebug) { trp = &trapdebug[TRAPSIZE - 1]; } else { trp--; } if (trp->cause == 0) break; printf("%s: ADR %jx PC %jx CR %jx SR %jx\n", trap_type[(trp->cause & MIPS_CR_EXC_CODE) >> MIPS_CR_EXC_CODE_SHIFT], (intmax_t)trp->vadr, (intmax_t)trp->pc, (intmax_t)trp->cause, (intmax_t)trp->status); printf(" RA %jx SP %jx code %d\n", (intmax_t)trp->ra, (intmax_t)trp->sp, (int)trp->code); } intr_restore(s); } #endif /* * Return the resulting PC as if the branch was executed. */ uintptr_t MipsEmulateBranch(struct trapframe *framePtr, uintptr_t instPC, int fpcCSR, uintptr_t instptr) { InstFmt inst; register_t *regsPtr = (register_t *) framePtr; uintptr_t retAddr = 0; int condition; #define GetBranchDest(InstPtr, inst) \ (InstPtr + 4 + ((short)inst.IType.imm << 2)) if (instptr) { if (instptr < MIPS_KSEG0_START) inst.word = fuword32((void *)instptr); else inst = *(InstFmt *) instptr; } else { if ((vm_offset_t)instPC < MIPS_KSEG0_START) inst.word = fuword32((void *)instPC); else inst = *(InstFmt *) instPC; } switch ((int)inst.JType.op) { case OP_SPECIAL: switch ((int)inst.RType.func) { case OP_JR: case OP_JALR: retAddr = regsPtr[inst.RType.rs]; break; default: retAddr = instPC + 4; break; } break; case OP_BCOND: switch ((int)inst.IType.rt) { case OP_BLTZ: case OP_BLTZL: case OP_BLTZAL: case OP_BLTZALL: if ((int)(regsPtr[inst.RType.rs]) < 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BGEZ: case OP_BGEZL: case OP_BGEZAL: case OP_BGEZALL: if ((int)(regsPtr[inst.RType.rs]) >= 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_TGEI: case OP_TGEIU: case OP_TLTI: case OP_TLTIU: case OP_TEQI: case OP_TNEI: retAddr = instPC + 4; /* Like syscall... */ break; default: panic("MipsEmulateBranch: Bad branch cond"); } break; case OP_J: case OP_JAL: retAddr = (inst.JType.target << 2) | ((unsigned)(instPC + 4) & 0xF0000000); break; case OP_BEQ: case OP_BEQL: if (regsPtr[inst.RType.rs] == regsPtr[inst.RType.rt]) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BNE: case OP_BNEL: if (regsPtr[inst.RType.rs] != regsPtr[inst.RType.rt]) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BLEZ: case OP_BLEZL: if ((int)(regsPtr[inst.RType.rs]) <= 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BGTZ: case OP_BGTZL: if ((int)(regsPtr[inst.RType.rs]) > 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_COP1: switch (inst.RType.rs) { case OP_BCx: case OP_BCy: if ((inst.RType.rt & COPz_BC_TF_MASK) == COPz_BC_TRUE) condition = fpcCSR & MIPS_FPU_COND_BIT; else condition = !(fpcCSR & MIPS_FPU_COND_BIT); if (condition) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; default: retAddr = instPC + 4; } break; default: retAddr = instPC + 4; } return (retAddr); } static void log_frame_dump(struct trapframe *frame) { log(LOG_ERR, "Trapframe Register Dump:\n"); log(LOG_ERR, "\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n", (intmax_t)0, (intmax_t)frame->ast, (intmax_t)frame->v0, (intmax_t)frame->v1); log(LOG_ERR, "\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n", (intmax_t)frame->a0, (intmax_t)frame->a1, (intmax_t)frame->a2, (intmax_t)frame->a3); #if defined(__mips_n32) || defined(__mips_n64) log(LOG_ERR, "\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta6: %#jx\n", (intmax_t)frame->a4, (intmax_t)frame->a5, (intmax_t)frame->a6, (intmax_t)frame->a7); log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); #else log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); log(LOG_ERR, "\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n", (intmax_t)frame->t4, (intmax_t)frame->t5, (intmax_t)frame->t6, (intmax_t)frame->t7); #endif log(LOG_ERR, "\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n", (intmax_t)frame->t8, (intmax_t)frame->t9, (intmax_t)frame->s0, (intmax_t)frame->s1); log(LOG_ERR, "\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n", (intmax_t)frame->s2, (intmax_t)frame->s3, (intmax_t)frame->s4, (intmax_t)frame->s5); log(LOG_ERR, "\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n", (intmax_t)frame->s6, (intmax_t)frame->s7, (intmax_t)frame->k0, (intmax_t)frame->k1); log(LOG_ERR, "\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n", (intmax_t)frame->gp, (intmax_t)frame->sp, (intmax_t)frame->s8, (intmax_t)frame->ra); log(LOG_ERR, "\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n", (intmax_t)frame->sr, (intmax_t)frame->mullo, (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr); log(LOG_ERR, "\tcause: %#jx\tpc: %#jx\n", (intmax_t)frame->cause, (intmax_t)frame->pc); } #ifdef TRAP_DEBUG static void trap_frame_dump(struct trapframe *frame) { printf("Trapframe Register Dump:\n"); printf("\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n", (intmax_t)0, (intmax_t)frame->ast, (intmax_t)frame->v0, (intmax_t)frame->v1); printf("\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n", (intmax_t)frame->a0, (intmax_t)frame->a1, (intmax_t)frame->a2, (intmax_t)frame->a3); #if defined(__mips_n32) || defined(__mips_n64) printf("\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta7: %#jx\n", (intmax_t)frame->a4, (intmax_t)frame->a5, (intmax_t)frame->a6, (intmax_t)frame->a7); printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); #else printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); printf("\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n", (intmax_t)frame->t4, (intmax_t)frame->t5, (intmax_t)frame->t6, (intmax_t)frame->t7); #endif printf("\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n", (intmax_t)frame->t8, (intmax_t)frame->t9, (intmax_t)frame->s0, (intmax_t)frame->s1); printf("\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n", (intmax_t)frame->s2, (intmax_t)frame->s3, (intmax_t)frame->s4, (intmax_t)frame->s5); printf("\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n", (intmax_t)frame->s6, (intmax_t)frame->s7, (intmax_t)frame->k0, (intmax_t)frame->k1); printf("\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n", (intmax_t)frame->gp, (intmax_t)frame->sp, (intmax_t)frame->s8, (intmax_t)frame->ra); printf("\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n", (intmax_t)frame->sr, (intmax_t)frame->mullo, (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr); printf("\tcause: %#jx\tpc: %#jx\n", (intmax_t)frame->cause, (intmax_t)frame->pc); } #endif static void get_mapping_info(vm_offset_t va, pd_entry_t **pdepp, pt_entry_t **ptepp) { pt_entry_t *ptep; pd_entry_t *pdep; struct proc *p = curproc; pdep = (&(p->p_vmspace->vm_pmap.pm_segtab[(va >> SEGSHIFT) & (NPDEPG - 1)])); if (*pdep) ptep = pmap_pte(&p->p_vmspace->vm_pmap, va); else ptep = (pt_entry_t *)0; *pdepp = pdep; *ptepp = ptep; } static void log_illegal_instruction(const char *msg, struct trapframe *frame) { pt_entry_t *ptep; pd_entry_t *pdep; unsigned int *addr; struct thread *td; struct proc *p; register_t pc; td = curthread; p = td->td_proc; #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx ra %#jx\n", msg, p->p_pid, (long)td->td_tid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, (intmax_t)pc, (intmax_t)frame->ra); /* log registers in trap frame */ log_frame_dump(frame); get_mapping_info((vm_offset_t)pc, &pdep, &ptep); /* * Dump a few words around faulting instruction, if the addres is * valid. */ if (!(pc & 3) && useracc((caddr_t)(intptr_t)pc, sizeof(int) * 4, VM_PROT_READ)) { /* dump page table entry for faulting instruction */ log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); addr = (unsigned int *)(intptr_t)pc; log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n", addr); log(LOG_ERR, "%08x %08x %08x %08x\n", addr[0], addr[1], addr[2], addr[3]); } else { log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } } static void log_bad_page_fault(char *msg, struct trapframe *frame, int trap_type) { pt_entry_t *ptep; pd_entry_t *pdep; unsigned int *addr; struct thread *td; struct proc *p; char *read_or_write; register_t pc; trap_type &= ~T_USER; td = curthread; p = td->td_proc; #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif switch (trap_type) { case T_TLB_MOD: case T_TLB_ST_MISS: case T_ADDR_ERR_ST: read_or_write = "write"; break; case T_TLB_LD_MISS: case T_ADDR_ERR_LD: case T_BUS_ERR_IFETCH: read_or_write = "read"; break; default: read_or_write = "unknown"; } pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx got a %s fault " "(type %#x) at %#jx\n", msg, p->p_pid, (long)td->td_tid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, (intmax_t)pc, read_or_write, trap_type, (intmax_t)frame->badvaddr); /* log registers in trap frame */ log_frame_dump(frame); get_mapping_info((vm_offset_t)pc, &pdep, &ptep); /* * Dump a few words around faulting instruction, if the addres is * valid. */ if (!(pc & 3) && (pc != frame->badvaddr) && (trap_type != T_BUS_ERR_IFETCH) && useracc((caddr_t)(intptr_t)pc, sizeof(int) * 4, VM_PROT_READ)) { /* dump page table entry for faulting instruction */ log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); addr = (unsigned int *)(intptr_t)pc; log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n", addr); log(LOG_ERR, "%08x %08x %08x %08x\n", addr[0], addr[1], addr[2], addr[3]); } else { log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } get_mapping_info((vm_offset_t)frame->badvaddr, &pdep, &ptep); log(LOG_ERR, "Page table info for bad address %#jx: pde = %p, pte = %#jx\n", (intmax_t)frame->badvaddr, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } /* * Unaligned load/store emulation */ static int mips_unaligned_load_store(struct trapframe *frame, int mode, register_t addr, register_t pc) { register_t *reg = (register_t *) frame; u_int32_t inst = *((u_int32_t *)(intptr_t)pc); register_t value_msb, value; unsigned size; /* * ADDR_ERR faults have higher priority than TLB * Miss faults. Therefore, it is necessary to * verify that the faulting address is a valid * virtual address within the process' address space * before trying to emulate the unaligned access. */ switch (MIPS_INST_OPCODE(inst)) { case OP_LHU: case OP_LH: case OP_SH: size = 2; break; case OP_LWU: case OP_LW: case OP_SW: size = 4; break; case OP_LD: case OP_SD: size = 8; break; default: printf("%s: unhandled opcode in address error: %#x\n", __func__, MIPS_INST_OPCODE(inst)); return (0); } if (!useracc((void *)rounddown2((vm_offset_t)addr, size), size * 2, mode)) return (0); /* * XXX * Handle LL/SC LLD/SCD. */ switch (MIPS_INST_OPCODE(inst)) { case OP_LHU: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lbu_macro(value_msb, addr); addr += 1; lbu_macro(value, addr); value |= value_msb << 8; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LHU_ACCESS); case OP_LH: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lb_macro(value_msb, addr); addr += 1; lbu_macro(value, addr); value |= value_msb << 8; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LH_ACCESS); case OP_LWU: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lwl_macro(value, addr); addr += 3; lwr_macro(value, addr); value &= 0xffffffff; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LWU_ACCESS); case OP_LW: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lwl_macro(value, addr); addr += 3; lwr_macro(value, addr); reg[MIPS_INST_RT(inst)] = value; return (MIPS_LW_ACCESS); #if defined(__mips_n32) || defined(__mips_n64) case OP_LD: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); ldl_macro(value, addr); addr += 7; ldr_macro(value, addr); reg[MIPS_INST_RT(inst)] = value; return (MIPS_LD_ACCESS); #endif case OP_SH: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; value_msb = value >> 8; sb_macro(value_msb, addr); addr += 1; sb_macro(value, addr); return (MIPS_SH_ACCESS); case OP_SW: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; swl_macro(value, addr); addr += 3; swr_macro(value, addr); return (MIPS_SW_ACCESS); #if defined(__mips_n32) || defined(__mips_n64) case OP_SD: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; sdl_macro(value, addr); addr += 7; sdr_macro(value, addr); return (MIPS_SD_ACCESS); #endif } panic("%s: should not be reached.", __func__); } /* * XXX TODO: SMP? */ static struct timeval unaligned_lasterr; static int unaligned_curerr; static int unaligned_pps_log_limit = 4; SYSCTL_INT(_machdep, OID_AUTO, unaligned_log_pps_limit, CTLFLAG_RWTUN, &unaligned_pps_log_limit, 0, "limit number of userland unaligned log messages per second"); static int emulate_unaligned_access(struct trapframe *frame, int mode) { register_t pc; int access_type = 0; struct thread *td = curthread; struct proc *p = curproc; pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); /* * Fall through if it's instruction fetch exception */ if (!((pc & 3) || (pc == frame->badvaddr))) { /* * Handle unaligned load and store */ /* * Return access type if the instruction was emulated. * Otherwise restore pc and fall through. */ access_type = mips_unaligned_load_store(frame, mode, frame->badvaddr, pc); if (access_type) { if (DELAYBRANCH(frame->cause)) frame->pc = MipsEmulateBranch(frame, frame->pc, 0, 0); else frame->pc += 4; if (ppsratecheck(&unaligned_lasterr, &unaligned_curerr, unaligned_pps_log_limit)) { /* XXX TODO: keep global/tid/pid counters? */ log(LOG_INFO, "Unaligned %s: pid=%ld (%s), tid=%ld, " "pc=%#jx, badvaddr=%#jx\n", access_name[access_type - 1], (long) p->p_pid, p->p_comm, (long) td->td_tid, (intmax_t)pc, (intmax_t)frame->badvaddr); } } } return access_type; } Index: head/sys/powerpc/powerpc/trap.c =================================================================== --- head/sys/powerpc/powerpc/trap.c (revision 319872) +++ head/sys/powerpc/powerpc/trap.c (revision 319873) @@ -1,881 +1,882 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: trap.c,v 1.58 2002/03/04 04:07:35 dbj Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Below matches setjmp.S */ #define FAULTBUF_LR 21 #define FAULTBUF_R1 1 #define FAULTBUF_R2 2 #define FAULTBUF_CR 22 #define FAULTBUF_R14 3 #define MOREARGS(sp) ((caddr_t)((uintptr_t)(sp) + \ sizeof(struct callframe) - 3*sizeof(register_t))) /* more args go here */ static void trap_fatal(struct trapframe *frame); static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user); static int trap_pfault(struct trapframe *frame, int user); static int fix_unaligned(struct thread *td, struct trapframe *frame); static int handle_onfault(struct trapframe *frame); static void syscall(struct trapframe *frame); #if defined(__powerpc64__) && defined(AIM) void handle_kernel_slb_spill(int, register_t, register_t); static int handle_user_slb_spill(pmap_t pm, vm_offset_t addr); extern int n_slbs; #endif #ifdef KDB int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ #endif struct powerpc_exception { u_int vector; char *name; }; #ifdef KDTRACE_HOOKS #include int (*dtrace_invop_jump_addr)(struct trapframe *); #endif static struct powerpc_exception powerpc_exceptions[] = { { EXC_CRIT, "critical input" }, { EXC_RST, "system reset" }, { EXC_MCHK, "machine check" }, { EXC_DSI, "data storage interrupt" }, { EXC_DSE, "data segment exception" }, { EXC_ISI, "instruction storage interrupt" }, { EXC_ISE, "instruction segment exception" }, { EXC_EXI, "external interrupt" }, { EXC_ALI, "alignment" }, { EXC_PGM, "program" }, { EXC_FPU, "floating-point unavailable" }, { EXC_APU, "auxiliary proc unavailable" }, { EXC_DECR, "decrementer" }, { EXC_FIT, "fixed-interval timer" }, { EXC_WDOG, "watchdog timer" }, { EXC_SC, "system call" }, { EXC_TRC, "trace" }, { EXC_FPA, "floating-point assist" }, { EXC_DEBUG, "debug" }, { EXC_PERF, "performance monitoring" }, { EXC_VEC, "altivec unavailable" }, { EXC_VSX, "vsx unavailable" }, { EXC_ITMISS, "instruction tlb miss" }, { EXC_DLMISS, "data load tlb miss" }, { EXC_DSMISS, "data store tlb miss" }, { EXC_BPT, "instruction breakpoint" }, { EXC_SMI, "system management" }, { EXC_VECAST_G4, "altivec assist" }, { EXC_THRM, "thermal management" }, { EXC_RUNMODETRC, "run mode/trace" }, { EXC_LAST, NULL } }; static const char * trapname(u_int vector) { struct powerpc_exception *pe; for (pe = powerpc_exceptions; pe->vector != EXC_LAST; pe++) { if (pe->vector == vector) return (pe->name); } return ("unknown"); } void trap(struct trapframe *frame) { struct thread *td; struct proc *p; #ifdef KDTRACE_HOOKS uint32_t inst; #endif int sig, type, user; u_int ucode; ksiginfo_t ksi; VM_CNT_INC(v_trap); td = curthread; p = td->td_proc; type = ucode = frame->exc; sig = 0; user = frame->srr1 & PSL_PR; CTR3(KTR_TRAP, "trap: %s type=%s (%s)", td->td_name, trapname(type), user ? "user" : "kernel"); #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type) != 0) return; #endif if (user) { td->td_pticks = 0; td->td_frame = frame; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); /* User Mode Traps */ switch (type) { case EXC_RUNMODETRC: case EXC_TRC: frame->srr1 &= ~PSL_SE; sig = SIGTRAP; ucode = TRAP_TRACE; break; #if defined(__powerpc64__) && defined(AIM) case EXC_ISE: case EXC_DSE: if (handle_user_slb_spill(&p->p_vmspace->vm_pmap, (type == EXC_ISE) ? frame->srr0 : frame->dar) != 0){ sig = SIGSEGV; ucode = SEGV_MAPERR; } break; #endif case EXC_DSI: case EXC_ISI: sig = trap_pfault(frame, 1); if (sig == SIGSEGV) ucode = SEGV_MAPERR; break; case EXC_SC: syscall(frame); break; case EXC_FPU: KASSERT((td->td_pcb->pcb_flags & PCB_FPU) != PCB_FPU, ("FPU already enabled for thread")); enable_fpu(td); break; case EXC_VEC: KASSERT((td->td_pcb->pcb_flags & PCB_VEC) != PCB_VEC, ("Altivec already enabled for thread")); enable_vec(td); break; case EXC_VSX: KASSERT((td->td_pcb->pcb_flags & PCB_VSX) != PCB_VSX, ("VSX already enabled for thread")); if (!(td->td_pcb->pcb_flags & PCB_VEC)) enable_vec(td); if (!(td->td_pcb->pcb_flags & PCB_FPU)) save_fpu(td); td->td_pcb->pcb_flags |= PCB_VSX; enable_fpu(td); break; case EXC_VECAST_E: case EXC_VECAST_G4: case EXC_VECAST_G5: /* * We get a VPU assist exception for IEEE mode * vector operations on denormalized floats. * Emulating this is a giant pain, so for now, * just switch off IEEE mode and treat them as * zero. */ save_vec(td); td->td_pcb->pcb_vec.vscr |= ALTIVEC_VSCR_NJ; enable_vec(td); break; case EXC_ALI: if (fix_unaligned(td, frame) != 0) { sig = SIGBUS; ucode = BUS_ADRALN; } else frame->srr0 += 4; break; case EXC_DEBUG: /* Single stepping */ mtspr(SPR_DBSR, mfspr(SPR_DBSR)); frame->srr1 &= ~PSL_DE; frame->cpu.booke.dbcr0 &= ~(DBCR0_IDM | DBCR0_IC); sig = SIGTRAP; ucode = TRAP_TRACE; break; case EXC_PGM: /* Identify the trap reason */ #ifdef AIM if (frame->srr1 & EXC_PGM_TRAP) { #else if (frame->cpu.booke.esr & ESR_PTR) { #endif #ifdef KDTRACE_HOOKS inst = fuword32((const void *)frame->srr0); if (inst == 0x0FFFDDDD && dtrace_pid_probe_ptr != NULL) { struct reg regs; fill_regs(td, ®s); (*dtrace_pid_probe_ptr)(®s); break; } #endif sig = SIGTRAP; ucode = TRAP_BRKPT; } else { sig = ppc_instr_emulate(frame, td->td_pcb); if (sig == SIGILL) { if (frame->srr1 & EXC_PGM_PRIV) ucode = ILL_PRVOPC; else if (frame->srr1 & EXC_PGM_ILLEGAL) ucode = ILL_ILLOPC; } else if (sig == SIGFPE) ucode = FPE_FLTINV; /* Punt for now, invalid operation. */ } break; case EXC_MCHK: /* * Note that this may not be recoverable for the user * process, depending on the type of machine check, * but it at least prevents the kernel from dying. */ sig = SIGBUS; ucode = BUS_OBJERR; break; default: trap_fatal(frame); } } else { /* Kernel Mode Traps */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case EXC_PGM: #ifdef KDTRACE_HOOKS #ifdef AIM if (frame->srr1 & EXC_PGM_TRAP) { #else if (frame->cpu.booke.esr & ESR_PTR) { #endif if (*(uint32_t *)frame->srr0 == EXC_DTRACE) { if (dtrace_invop_jump_addr != NULL) { dtrace_invop_jump_addr(frame); return; } } } #endif #ifdef KDB if (db_trap_glue(frame)) return; #endif break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: if ((frame->dar & SEGMENT_MASK) == USER_ADDR) { __asm __volatile ("slbmte %0, %1" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); return; } break; #endif case EXC_DSI: if (trap_pfault(frame, 0) == 0) return; break; case EXC_MCHK: if (handle_onfault(frame)) return; break; default: break; } trap_fatal(frame); } if (sig != 0) { if (p->p_sysent->sv_transtrap != NULL) sig = (p->p_sysent->sv_transtrap)(sig, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int) ucode; /* XXX, not POSIX */ /* ksi.ksi_addr = ? */ ksi.ksi_trapno = type; trapsignal(td, &ksi); } userret(td, frame); } static void trap_fatal(struct trapframe *frame) { printtrap(frame->exc, frame, 1, (frame->srr1 & PSL_PR)); #ifdef KDB if ((debugger_on_panic || kdb_active) && kdb_trap(frame->exc, 0, frame)) return; #endif panic("%s trap", trapname(frame->exc)); } static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { uint16_t ver; #ifdef BOOKE vm_paddr_t pa; #endif printf("\n"); printf("%s %s trap:\n", isfatal ? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" exception = 0x%x (%s)\n", vector, trapname(vector)); switch (vector) { case EXC_DSE: case EXC_DSI: case EXC_DTMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar); #ifdef AIM printf(" dsisr = 0x%lx\n", (u_long)frame->cpu.aim.dsisr); #endif break; case EXC_ISE: case EXC_ISI: case EXC_ITMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->srr0); break; case EXC_MCHK: ver = mfpvr() >> 16; #if defined(AIM) if (MPC745X_P(ver)) printf(" msssr0 = 0x%lx\n", (u_long)mfspr(SPR_MSSSR0)); #elif defined(BOOKE) pa = mfspr(SPR_MCARU); pa = (pa << 32) | (u_register_t)mfspr(SPR_MCAR); printf(" mcsr = 0x%lx\n", (u_long)mfspr(SPR_MCSR)); printf(" mcar = 0x%jx\n", (uintmax_t)pa); #endif break; } #ifdef BOOKE printf(" esr = 0x%" PRIxPTR "\n", frame->cpu.booke.esr); #endif printf(" srr0 = 0x%" PRIxPTR "\n", frame->srr0); printf(" srr1 = 0x%lx\n", (u_long)frame->srr1); printf(" lr = 0x%" PRIxPTR "\n", frame->lr); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_name); printf("\n"); } /* * Handles a fatal fault when we have onfault state to recover. Returns * non-zero if there was onfault recovery state available. */ static int handle_onfault(struct trapframe *frame) { struct thread *td; jmp_buf *fb; td = curthread; fb = td->td_pcb->pcb_onfault; if (fb != NULL) { frame->srr0 = (*fb)->_jb[FAULTBUF_LR]; frame->fixreg[1] = (*fb)->_jb[FAULTBUF_R1]; frame->fixreg[2] = (*fb)->_jb[FAULTBUF_R2]; frame->fixreg[3] = 1; frame->cr = (*fb)->_jb[FAULTBUF_CR]; bcopy(&(*fb)->_jb[FAULTBUF_R14], &frame->fixreg[14], 18 * sizeof(register_t)); td->td_pcb->pcb_onfault = NULL; /* Returns twice, not thrice */ return (1); } return (0); } int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; + struct syscall_args *sa; caddr_t params; size_t argsz; int error, n, i; p = td->td_proc; frame = td->td_frame; + sa = &td->td_sa; sa->code = frame->fixreg[0]; params = (caddr_t)(frame->fixreg + FIRSTARG); n = NARGREG; if (sa->code == SYS_syscall) { /* * code is first argument, * followed by actual args. */ sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, * so as to maintain quad alignment * for the rest of the args. */ if (SV_PROC_FLAG(p, SV_ILP32)) { params += sizeof(register_t); sa->code = *(register_t *) params; params += sizeof(register_t); n -= 2; } else { sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (SV_PROC_FLAG(p, SV_ILP32)) { argsz = sizeof(uint32_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i] & 0xffffffff; } else { argsz = sizeof(uint64_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i]; } if (sa->narg > n) error = copyin(MOREARGS(frame->fixreg[1]), sa->args + n, (sa->narg - n) * argsz); else error = 0; #ifdef __powerpc64__ if (SV_PROC_FLAG(p, SV_ILP32) && sa->narg > n) { /* Expand the size of arguments copied from the stack */ for (i = sa->narg; i >= n; i--) sa->args[i] = ((uint32_t *)(&sa->args[n]))[i-n]; } #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->fixreg[FIRSTARG + 1]; } return (error); } #include "../../kern/subr_syscall.c" void syscall(struct trapframe *frame) { struct thread *td; - struct syscall_args sa; int error; td = curthread; td->td_frame = frame; #if defined(__powerpc64__) && defined(AIM) /* * Speculatively restore last user SLB segment, which we know is * invalid already, since we are likely to do copyin()/copyout(). */ __asm __volatile ("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #endif - error = syscallenter(td, &sa); - syscallret(td, error, &sa); + error = syscallenter(td); + syscallret(td, error); } #if defined(__powerpc64__) && defined(AIM) /* Handle kernel SLB faults -- runs in real mode, all seat belts off */ void handle_kernel_slb_spill(int type, register_t dar, register_t srr0) { struct slb *slbcache; uint64_t slbe, slbv; uint64_t esid, addr; int i; addr = (type == EXC_ISE) ? srr0 : dar; slbcache = PCPU_GET(slb); esid = (uintptr_t)addr >> ADDR_SR_SHFT; slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID; /* See if the hardware flushed this somehow (can happen in LPARs) */ for (i = 0; i < n_slbs; i++) if (slbcache[i].slbe == (slbe | (uint64_t)i)) return; /* Not in the map, needs to actually be added */ slbv = kernel_va_to_slbv(addr); if (slbcache[USER_SLB_SLOT].slbe == 0) { for (i = 0; i < n_slbs; i++) { if (i == USER_SLB_SLOT) continue; if (!(slbcache[i].slbe & SLBE_VALID)) goto fillkernslb; } if (i == n_slbs) slbcache[USER_SLB_SLOT].slbe = 1; } /* Sacrifice a random SLB entry that is not the user entry */ i = mftb() % n_slbs; if (i == USER_SLB_SLOT) i = (i+1) % n_slbs; fillkernslb: /* Write new entry */ slbcache[i].slbv = slbv; slbcache[i].slbe = slbe | (uint64_t)i; /* Trap handler will restore from cache on exit */ } static int handle_user_slb_spill(pmap_t pm, vm_offset_t addr) { struct slb *user_entry; uint64_t esid; int i; esid = (uintptr_t)addr >> ADDR_SR_SHFT; PMAP_LOCK(pm); user_entry = user_va_to_slb_entry(pm, addr); if (user_entry == NULL) { /* allocate_vsid auto-spills it */ (void)allocate_user_vsid(pm, esid, 0); } else { /* * Check that another CPU has not already mapped this. * XXX: Per-thread SLB caches would be better. */ for (i = 0; i < pm->pm_slb_len; i++) if (pm->pm_slb[i] == user_entry) break; if (i == pm->pm_slb_len) slb_insert_user(pm, user_entry); } PMAP_UNLOCK(pm); return (0); } #endif static int trap_pfault(struct trapframe *frame, int user) { vm_offset_t eva, va; struct thread *td; struct proc *p; vm_map_t map; vm_prot_t ftype; int rv; #ifdef AIM register_t user_sr; #endif td = curthread; p = td->td_proc; if (frame->exc == EXC_ISI) { eva = frame->srr0; ftype = VM_PROT_EXECUTE; if (frame->srr1 & SRR1_ISI_PFAULT) ftype |= VM_PROT_READ; } else { eva = frame->dar; #ifdef BOOKE if (frame->cpu.booke.esr & ESR_ST) #else if (frame->cpu.aim.dsisr & DSISR_STORE) #endif ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; } else { #ifdef BOOKE if (eva < VM_MAXUSER_ADDRESS) { #else if ((eva >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) { #endif map = &p->p_vmspace->vm_map; #ifdef AIM user_sr = td->td_pcb->pcb_cpu.aim.usr_segm; eva &= ADDR_PIDX | ADDR_POFF; eva |= user_sr << ADDR_SR_SHFT; #endif } else { map = kernel_map; } } va = trunc_page(eva); /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ if (rv == KERN_SUCCESS) return (0); if (!user && handle_onfault(frame)) return (0); return (SIGSEGV); } /* * For now, this only deals with the particular unaligned access case * that gcc tends to generate. Eventually it should handle all of the * possibilities that can happen on a 32-bit PowerPC in big-endian mode. */ static int fix_unaligned(struct thread *td, struct trapframe *frame) { struct thread *fputhread; #ifdef __SPE__ uint32_t inst; #endif int indicator, reg; double *fpr; #ifdef __SPE__ indicator = (frame->cpu.booke.esr & (ESR_ST|ESR_SPE)); if (indicator & ESR_SPE) { if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_SPE_REG(inst); fpr = (double *)td->td_pcb->pcb_vec.vr[reg]; fputhread = PCPU_GET(vecthread); /* Juggle the SPE to ensure that we've initialized * the registers, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_vec(fputhread); enable_vec(td); } save_vec(td); if (!(indicator & ESR_ST)) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); frame->fixreg[reg] = td->td_pcb->pcb_vec.vr[reg][1]; enable_vec(td); } else { td->td_pcb->pcb_vec.vr[reg][1] = frame->fixreg[reg]; if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); } #else indicator = EXC_ALI_OPCODE_INDICATOR(frame->cpu.aim.dsisr); switch (indicator) { case EXC_ALI_LFD: case EXC_ALI_STFD: reg = EXC_ALI_RST(frame->cpu.aim.dsisr); fpr = &td->td_pcb->pcb_fpu.fpr[reg].fpr; fputhread = PCPU_GET(fputhread); /* Juggle the FPU to ensure that we've initialized * the FPRs, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_fpu(fputhread); enable_fpu(td); } save_fpu(td); if (indicator == EXC_ALI_LFD) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); enable_fpu(td); } else { if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); break; } #endif return (-1); } #ifdef KDB int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC #ifdef AIM || (frame->exc == EXC_PGM && (frame->srr1 & EXC_PGM_TRAP)) #else || (frame->exc == EXC_DEBUG) || (frame->cpu.booke.esr & ESR_PTR) #endif || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; /* Ignore DTrace traps. */ if (*(uint32_t *)frame->srr0 == EXC_DTRACE) return (0); #ifdef AIM if (type == EXC_PGM && (frame->srr1 & EXC_PGM_TRAP)) { #else if (type == EXC_DEBUG || (frame->cpu.booke.esr & ESR_PTR)) { #endif type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } #endif Index: head/sys/riscv/riscv/trap.c =================================================================== --- head/sys/riscv/riscv/trap.c (revision 319872) +++ head/sys/riscv/riscv/trap.c (revision 319873) @@ -1,386 +1,387 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif int (*dtrace_invop_jump_addr)(struct trapframe *); extern register_t fsu_intr_fault; /* Called from exception.S */ void do_trap_supervisor(struct trapframe *); void do_trap_user(struct trapframe *); static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; trapsignal(td, &ksi); } int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap; + struct syscall_args *sa; int nap; nap = NARGREG; p = td->td_proc; + sa = &td->td_sa; ap = &td->td_frame->tf_a[0]; sa->code = td->td_frame->tf_t[0]; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = *ap++; nap--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; memcpy(sa->args, ap, nap * sizeof(register_t)); if (sa->narg > nap) panic("TODO: Could we have more then %d args?", NARGREG); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" static void dump_regs(struct trapframe *frame) { int n; int i; n = (sizeof(frame->tf_t) / sizeof(frame->tf_t[0])); for (i = 0; i < n; i++) printf("t[%d] == 0x%016lx\n", i, frame->tf_t[i]); n = (sizeof(frame->tf_s) / sizeof(frame->tf_s[0])); for (i = 0; i < n; i++) printf("s[%d] == 0x%016lx\n", i, frame->tf_s[i]); n = (sizeof(frame->tf_a) / sizeof(frame->tf_a[0])); for (i = 0; i < n; i++) printf("a[%d] == 0x%016lx\n", i, frame->tf_a[i]); printf("sepc == 0x%016lx\n", frame->tf_sepc); printf("sstatus == 0x%016lx\n", frame->tf_sstatus); } static void svc_handler(struct trapframe *frame) { - struct syscall_args sa; struct thread *td; int error; td = curthread; td->td_frame = frame; - error = syscallenter(td, &sa); - syscallret(td, error, &sa); + error = syscallenter(td); + syscallret(td, error); } static void data_abort(struct trapframe *frame, int lower) { struct vm_map *map; uint64_t sbadaddr; struct thread *td; struct pcb *pcb; vm_prot_t ftype; vm_offset_t va; struct proc *p; int ucode; int error; int sig; #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif td = curthread; pcb = td->td_pcb; /* * Special case for fuswintr and suswintr. These can't sleep so * handle them early on in the trap handler. */ if (__predict_false(pcb->pcb_onfault == (vm_offset_t)&fsu_intr_fault)) { frame->tf_sepc = pcb->pcb_onfault; return; } sbadaddr = frame->tf_sbadaddr; p = td->td_proc; if (lower) map = &td->td_proc->p_vmspace->vm_map; else { /* The top bit tells us which range to use */ if ((sbadaddr >> 63) == 1) map = kernel_map; else map = &td->td_proc->p_vmspace->vm_map; } va = trunc_page(sbadaddr); if (frame->tf_scause == EXCP_FAULT_STORE) { ftype = (VM_PROT_READ | VM_PROT_WRITE); } else { ftype = (VM_PROT_READ); } if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (error != KERN_SUCCESS) { if (lower) { sig = SIGSEGV; if (error == KERN_PROTECTION_FAILURE) ucode = SEGV_ACCERR; else ucode = SEGV_MAPERR; call_trapsignal(td, sig, ucode, (void *)sbadaddr); } else { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != 0) { frame->tf_a[0] = error; frame->tf_sepc = pcb->pcb_onfault; return; } dump_regs(frame); panic("vm_fault failed: %lx, va 0x%016lx", frame->tf_sepc, sbadaddr); } } if (lower) userret(td, frame); } void do_trap_supervisor(struct trapframe *frame) { uint64_t exception; uint64_t sstatus; /* Ensure we came from supervisor mode, interrupts disabled */ __asm __volatile("csrr %0, sstatus" : "=&r" (sstatus)); KASSERT((sstatus & (SSTATUS_SPP | SSTATUS_SIE)) == SSTATUS_SPP, ("We must came from S mode with interrupts disabled")); exception = (frame->tf_scause & EXCP_MASK); if (frame->tf_scause & EXCP_INTR) { /* Interrupt */ riscv_cpu_intr(frame); return; } #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR3(KTR_TRAP, "do_trap_supervisor: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch(exception) { case EXCP_FAULT_LOAD: case EXCP_FAULT_STORE: case EXCP_FAULT_FETCH: data_abort(frame, 0); break; case EXCP_BREAKPOINT: #ifdef KDTRACE_HOOKS if (dtrace_invop_jump_addr != 0) { dtrace_invop_jump_addr(frame); break; } #endif #ifdef KDB kdb_trap(exception, 0, frame); #else dump_regs(frame); panic("No debugger in kernel.\n"); #endif break; case EXCP_ILLEGAL_INSTRUCTION: dump_regs(frame); panic("Illegal instruction at 0x%016lx\n", frame->tf_sepc); break; default: dump_regs(frame); panic("Unknown kernel exception %x badaddr %lx\n", exception, frame->tf_sbadaddr); } } void do_trap_user(struct trapframe *frame) { uint64_t exception; struct thread *td; uint64_t sstatus; struct pcb *pcb; td = curthread; td->td_frame = frame; pcb = td->td_pcb; /* Ensure we came from usermode, interrupts disabled */ __asm __volatile("csrr %0, sstatus" : "=&r" (sstatus)); KASSERT((sstatus & (SSTATUS_SPP | SSTATUS_SIE)) == 0, ("We must came from U mode with interrupts disabled")); exception = (frame->tf_scause & EXCP_MASK); if (frame->tf_scause & EXCP_INTR) { /* Interrupt */ riscv_cpu_intr(frame); return; } CTR3(KTR_TRAP, "do_trap_user: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch(exception) { case EXCP_FAULT_LOAD: case EXCP_FAULT_STORE: case EXCP_FAULT_FETCH: data_abort(frame, 1); break; case EXCP_USER_ECALL: frame->tf_sepc += 4; /* Next instruction */ svc_handler(frame); break; case EXCP_ILLEGAL_INSTRUCTION: #ifdef FPE if ((pcb->pcb_fpflags & PCB_FP_STARTED) == 0) { /* * May be a FPE trap. Enable FPE usage * for this thread and try again. */ frame->tf_sstatus |= SSTATUS_FS_INITIAL; pcb->pcb_fpflags |= PCB_FP_STARTED; break; } #endif call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)frame->tf_sepc); userret(td, frame); break; case EXCP_BREAKPOINT: call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_sepc); userret(td, frame); break; default: dump_regs(frame); panic("Unknown userland exception %x badaddr %lx\n", exception, frame->tf_sbadaddr); } } Index: head/sys/sparc64/sparc64/trap.c =================================================================== --- head/sys/sparc64/sparc64/trap.c (revision 319872) +++ head/sys/sparc64/sparc64/trap.c (revision 319873) @@ -1,617 +1,618 @@ /*- * Copyright (c) 2001, Jake Burkholder * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * from: FreeBSD: src/sys/i386/i386/trap.c,v 1.197 2001/07/19 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktr.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void trap(struct trapframe *tf); void syscall(struct trapframe *tf); static int trap_cecc(void); static int trap_pfault(struct thread *td, struct trapframe *tf); extern char copy_fault[]; extern char copy_nofault_begin[]; extern char copy_nofault_end[]; extern char fs_fault[]; extern char fs_nofault_begin[]; extern char fs_nofault_end[]; extern char fs_nofault_intr_begin[]; extern char fs_nofault_intr_end[]; extern char fas_fault[]; extern char fas_nofault_begin[]; extern char fas_nofault_end[]; const char *const trap_msg[] = { "reserved", "instruction access exception", "instruction access error", "instruction access protection", "illtrap instruction", "illegal instruction", "privileged opcode", "floating point disabled", "floating point exception ieee 754", "floating point exception other", "tag overflow", "division by zero", "data access exception", "data access error", "data access protection", "memory address not aligned", "privileged action", "async data error", "trap instruction 16", "trap instruction 17", "trap instruction 18", "trap instruction 19", "trap instruction 20", "trap instruction 21", "trap instruction 22", "trap instruction 23", "trap instruction 24", "trap instruction 25", "trap instruction 26", "trap instruction 27", "trap instruction 28", "trap instruction 29", "trap instruction 30", "trap instruction 31", "fast instruction access mmu miss", "fast data access mmu miss", "interrupt", "physical address watchpoint", "virtual address watchpoint", "corrected ecc error", "spill", "fill", "fill", "breakpoint", "clean window", "range check", "fix alignment", "integer overflow", "syscall", "restore physical watchpoint", "restore virtual watchpoint", "kernel stack fault", }; static const int trap_sig[] = { SIGILL, /* reserved */ SIGILL, /* instruction access exception */ SIGILL, /* instruction access error */ SIGILL, /* instruction access protection */ SIGILL, /* illtrap instruction */ SIGILL, /* illegal instruction */ SIGBUS, /* privileged opcode */ SIGFPE, /* floating point disabled */ SIGFPE, /* floating point exception ieee 754 */ SIGFPE, /* floating point exception other */ SIGEMT, /* tag overflow */ SIGFPE, /* division by zero */ SIGILL, /* data access exception */ SIGILL, /* data access error */ SIGBUS, /* data access protection */ SIGBUS, /* memory address not aligned */ SIGBUS, /* privileged action */ SIGBUS, /* async data error */ SIGILL, /* trap instruction 16 */ SIGILL, /* trap instruction 17 */ SIGILL, /* trap instruction 18 */ SIGILL, /* trap instruction 19 */ SIGILL, /* trap instruction 20 */ SIGILL, /* trap instruction 21 */ SIGILL, /* trap instruction 22 */ SIGILL, /* trap instruction 23 */ SIGILL, /* trap instruction 24 */ SIGILL, /* trap instruction 25 */ SIGILL, /* trap instruction 26 */ SIGILL, /* trap instruction 27 */ SIGILL, /* trap instruction 28 */ SIGILL, /* trap instruction 29 */ SIGILL, /* trap instruction 30 */ SIGILL, /* trap instruction 31 */ SIGSEGV, /* fast instruction access mmu miss */ SIGSEGV, /* fast data access mmu miss */ -1, /* interrupt */ -1, /* physical address watchpoint */ -1, /* virtual address watchpoint */ -1, /* corrected ecc error */ SIGILL, /* spill */ SIGILL, /* fill */ SIGILL, /* fill */ SIGTRAP, /* breakpoint */ SIGILL, /* clean window */ SIGILL, /* range check */ SIGILL, /* fix alignment */ SIGILL, /* integer overflow */ SIGSYS, /* syscall */ -1, /* restore physical watchpoint */ -1, /* restore virtual watchpoint */ -1, /* kernel stack fault */ }; CTASSERT(nitems(trap_msg) == T_MAX); CTASSERT(nitems(trap_sig) == T_MAX); CTASSERT(sizeof(struct trapframe) == 256); int debugger_on_signal = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_signal, CTLFLAG_RW, &debugger_on_signal, 0, ""); u_int corrected_ecc = 0; SYSCTL_UINT(_machdep, OID_AUTO, corrected_ecc, CTLFLAG_RD, &corrected_ecc, 0, "corrected ECC errors"); /* * SUNW,set-trap-table allows to take over %tba from the PROM, which * will turn off interrupts and handle outstanding ones while doing so, * in a safe way. */ void sun4u_set_traptable(void *tba_addr) { static struct { cell_t name; cell_t nargs; cell_t nreturns; cell_t tba_addr; } args = { (cell_t)"SUNW,set-trap-table", 1, 0, }; args.tba_addr = (cell_t)tba_addr; ofw_entry(&args); } void trap(struct trapframe *tf) { struct thread *td; struct proc *p; int error; int sig; register_t addr; ksiginfo_t ksi; td = curthread; CTR4(KTR_TRAP, "trap: %p type=%s (%s) pil=%#lx", td, trap_msg[tf->tf_type & ~T_KERNEL], (TRAPF_USERMODE(tf) ? "user" : "kernel"), rdpr(pil)); VM_CNT_INC(v_trap); if ((tf->tf_tstate & TSTATE_PRIV) == 0) { KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; td->td_pticks = 0; td->td_frame = tf; addr = tf->tf_tpc; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (tf->tf_type) { case T_DATA_MISS: case T_DATA_PROTECTION: addr = tf->tf_sfar; /* FALLTHROUGH */ case T_INSTRUCTION_MISS: sig = trap_pfault(td, tf); break; case T_FILL: sig = rwindow_load(td, tf, 2); break; case T_FILL_RET: sig = rwindow_load(td, tf, 1); break; case T_SPILL: sig = rwindow_save(td); break; case T_CORRECTED_ECC_ERROR: sig = trap_cecc(); break; default: if (tf->tf_type > T_MAX) panic("trap: bad trap type %#lx (user)", tf->tf_type); else if (trap_sig[tf->tf_type] == -1) panic("trap: %s (user)", trap_msg[tf->tf_type]); sig = trap_sig[tf->tf_type]; break; } if (sig != 0) { /* Translate fault for emulators. */ if (p->p_sysent->sv_transtrap != NULL) { sig = p->p_sysent->sv_transtrap(sig, tf->tf_type); } if (debugger_on_signal && (sig == 4 || sig == 10 || sig == 11)) kdb_enter(KDB_WHY_TRAPSIG, "trapsig"); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int)tf->tf_type; /* XXX not POSIX */ ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = (int)tf->tf_type; trapsignal(td, &ksi); } userret(td, tf); } else { KASSERT((tf->tf_type & T_KERNEL) != 0, ("trap: kernel trap isn't")); if (kdb_active) { kdb_reenter(); return; } switch (tf->tf_type & ~T_KERNEL) { case T_BREAKPOINT: case T_KSTACK_FAULT: error = (kdb_trap(tf->tf_type, 0, tf) == 0); TF_DONE(tf); break; #ifdef notyet case T_PA_WATCHPOINT: case T_VA_WATCHPOINT: error = db_watch_trap(tf); break; #endif case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: error = trap_pfault(td, tf); break; case T_DATA_EXCEPTION: case T_MEM_ADDRESS_NOT_ALIGNED: if ((tf->tf_sfsr & MMU_SFSR_FV) != 0 && MMU_SFSR_GET_ASI(tf->tf_sfsr) == ASI_AIUP) { if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } } error = 1; break; case T_DATA_ERROR: /* * Handle PCI poke/peek as per UltraSPARC IIi * User's Manual 16.2.1, modulo checking the * TPC as USIII CPUs generate a precise trap * instead of a special deferred one. */ if (tf->tf_tpc > (u_long)fas_nofault_begin && tf->tf_tpc < (u_long)fas_nofault_end) { cache_flush(); cache_enable(PCPU_GET(impl)); tf->tf_tpc = (u_long)fas_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } error = 1; break; case T_CORRECTED_ECC_ERROR: error = trap_cecc(); break; default: error = 1; break; } if (error != 0) { tf->tf_type &= ~T_KERNEL; if (tf->tf_type > T_MAX) panic("trap: bad trap type %#lx (kernel)", tf->tf_type); panic("trap: %s (kernel)", trap_msg[tf->tf_type]); } } CTR1(KTR_TRAP, "trap: td=%p return", td); } static int trap_cecc(void) { u_long eee; /* * Turn off (non-)correctable error reporting while we're dealing * with the error. */ eee = ldxa(0, ASI_ESTATE_ERROR_EN_REG); stxa_sync(0, ASI_ESTATE_ERROR_EN_REG, eee & ~(AA_ESTATE_NCEEN | AA_ESTATE_CEEN)); /* Flush the caches in order ensure no corrupt data got installed. */ cache_flush(); /* Ensure the caches are still turned on (should be). */ cache_enable(PCPU_GET(impl)); /* Clear the error from the AFSR. */ stxa_sync(0, ASI_AFSR, ldxa(0, ASI_AFSR)); corrected_ecc++; printf("corrected ECC error\n"); /* Turn (non-)correctable error reporting back on. */ stxa_sync(0, ASI_ESTATE_ERROR_EN_REG, eee); return (0); } static int trap_pfault(struct thread *td, struct trapframe *tf) { vm_map_t map; struct proc *p; vm_offset_t va; vm_prot_t prot; vm_map_entry_t entry; u_long ctx; int type; int rv; if (td == NULL) return (-1); KASSERT(td->td_pcb != NULL, ("trap_pfault: pcb NULL")); KASSERT(td->td_proc != NULL, ("trap_pfault: curproc NULL")); KASSERT(td->td_proc->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); p = td->td_proc; rv = KERN_SUCCESS; ctx = TLB_TAR_CTX(tf->tf_tar); type = tf->tf_type & ~T_KERNEL; va = TLB_TAR_VA(tf->tf_tar); CTR4(KTR_TRAP, "trap_pfault: td=%p pm_ctx=%#lx va=%#lx ctx=%#lx", td, p->p_vmspace->vm_pmap.pm_context[curcpu], va, ctx); if (type == T_DATA_PROTECTION) prot = VM_PROT_WRITE; else { if (type == T_DATA_MISS) prot = VM_PROT_READ; else prot = VM_PROT_READ | VM_PROT_EXECUTE; } if (ctx != TLB_CTX_KERNEL) { if ((tf->tf_tstate & TSTATE_PRIV) != 0 && (tf->tf_tpc >= (u_long)fs_nofault_intr_begin && tf->tf_tpc <= (u_long)fs_nofault_intr_end)) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } /* This is a fault on non-kernel virtual memory. */ map = &p->p_vmspace->vm_map; } else { /* * This is a fault on kernel virtual memory. Attempts to * access kernel memory from user mode cause privileged * action traps, not page fault. */ KASSERT(tf->tf_tstate & TSTATE_PRIV, ("trap_pfault: fault on nucleus context from user mode")); if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { vm_map_lock_read(kernel_map); if (vm_map_lookup_entry(kernel_map, va, &entry) && (entry->eflags & MAP_ENTRY_NOFAULT) != 0) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; vm_map_unlock_read(kernel_map); return (0); } vm_map_unlock_read(kernel_map); } map = kernel_map; } /* Fault in the page. */ rv = vm_fault(map, va, prot, VM_FAULT_NORMAL); CTR3(KTR_TRAP, "trap_pfault: return td=%p va=%#lx rv=%d", td, va, rv); if (rv == KERN_SUCCESS) return (0); if (ctx != TLB_CTX_KERNEL && (tf->tf_tstate & TSTATE_PRIV) != 0) { if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } /* Maximum number of arguments that can be passed via the out registers. */ #define REG_MAXARGS 6 int -cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) +cpu_fetch_syscall_args(struct thread *td) { struct trapframe *tf; struct proc *p; register_t *argp; + struct syscall_args *sa; int reg; int regcnt; int error; p = td->td_proc; tf = td->td_frame; + sa = &td->td_sa; reg = 0; regcnt = REG_MAXARGS; sa->code = tf->tf_global[1]; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = tf->tf_out[reg++]; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = sa->args; bcopy(&tf->tf_out[reg], sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) error = copyin((void *)(tf->tf_out[6] + SPOFF + offsetof(struct frame, fr_pad[6])), &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; } return (error); } #include "../../kern/subr_syscall.c" /* * Syscall handler * The arguments to the syscall are passed in the out registers by the caller, * and are saved in the trap frame. The syscall number is passed in %g1 (and * also saved in the trap frame). */ void syscall(struct trapframe *tf) { struct thread *td; - struct syscall_args sa; int error; td = curthread; td->td_frame = tf; KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); /* * For syscalls, we don't want to retry the faulting instruction * (usually), instead we need to advance one instruction. */ td->td_pcb->pcb_tpc = tf->tf_tpc; TF_DONE(tf); - error = syscallenter(td, &sa); - syscallret(td, error, &sa); + error = syscallenter(td); + syscallret(td, error); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 319872) +++ head/sys/sys/proc.h (revision 319873) @@ -1,1137 +1,1138 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt + * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ - u_int td_dbg_sc_code; /* (c) Syscall code to debugger. */ - u_int td_dbg_sc_narg; /* (c) Syscall arg count to debugger.*/ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ + struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on + fork for child tracing. */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) ((td)->td_locks--) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding") #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ u_int p_ptevents; /* (c) ptrace() event mask. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint16_t p_elf_machine; /* (x) ELF machine type */ uint64_t p_elf_flags; /* (x) ELF flags */ /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KPROC 0x00004 /* Kernel process. */ #define P_UNUSED3 0x00008 /* --available-- */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x2000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeothergids(struct ucred *u1, struct ucred *u2); int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); int cr_canseejailproc(struct ucred *u1, struct ucred *u2); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); void proc_set_traced(struct proc *p, bool stop); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); -int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa); +int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } extern void (*softdep_ast_cleanup)(struct thread *); static __inline void td_softdep_cleanup(struct thread *td) { if (td->td_su != NULL && softdep_ast_cleanup != NULL) softdep_ast_cleanup(td); } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/sysent.h =================================================================== --- head/sys/sys/sysent.h (revision 319872) +++ head/sys/sys/sysent.h (revision 319873) @@ -1,287 +1,286 @@ /*- * Copyright (c) 1982, 1988, 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SYSENT_H_ #define _SYS_SYSENT_H_ #include struct rlimit; struct sysent; struct thread; struct ksiginfo; struct syscall_args; enum systrace_probe_t { SYSTRACE_ENTRY, SYSTRACE_RETURN, }; typedef int sy_call_t(struct thread *, void *); typedef void (*systrace_probe_func_t)(struct syscall_args *, enum systrace_probe_t, int); typedef void (*systrace_args_func_t)(int, void *, uint64_t *, int *); extern systrace_probe_func_t systrace_probe_func; struct sysent { /* system call table */ int sy_narg; /* number of arguments */ sy_call_t *sy_call; /* implementing function */ au_event_t sy_auevent; /* audit event associated with syscall */ systrace_args_func_t sy_systrace_args_func; /* optional argument conversion function. */ u_int32_t sy_entry; /* DTrace entry ID for systrace. */ u_int32_t sy_return; /* DTrace return ID for systrace. */ u_int32_t sy_flags; /* General flags for system calls. */ u_int32_t sy_thrcnt; }; /* * A system call is permitted in capability mode. */ #define SYF_CAPENABLED 0x00000001 #define SY_THR_FLAGMASK 0x7 #define SY_THR_STATIC 0x1 #define SY_THR_DRAINING 0x2 #define SY_THR_ABSENT 0x4 #define SY_THR_INCR 0x8 #ifdef KLD_MODULE #define SY_THR_STATIC_KLD 0 #else #define SY_THR_STATIC_KLD SY_THR_STATIC #endif struct image_params; struct __sigset; struct trapframe; struct vnode; struct sysentvec { int sv_size; /* number of entries */ struct sysent *sv_table; /* pointer to sysent */ u_int sv_mask; /* optional mask to index */ int sv_errsize; /* size of errno translation table */ int *sv_errtbl; /* errno translation table */ int (*sv_transtrap)(int, int); /* translate trap-to-signal mapping */ int (*sv_fixup)(register_t **, struct image_params *); /* stack fixup function */ void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *); /* send signal */ char *sv_sigcode; /* start of sigtramp code */ int *sv_szsigcode; /* size of sigtramp code */ char *sv_name; /* name of binary type */ int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); /* function to dump core, or NULL */ int (*sv_imgact_try)(struct image_params *); int sv_minsigstksz; /* minimum signal stack size */ int sv_pagesize; /* pagesize */ vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */ vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */ vm_offset_t sv_usrstack; /* USRSTACK */ vm_offset_t sv_psstrings; /* PS_STRINGS */ int sv_stackprot; /* vm protection for stack */ register_t *(*sv_copyout_strings)(struct image_params *); void (*sv_setregs)(struct thread *, struct image_params *, u_long); void (*sv_fixlimit)(struct rlimit *, int); u_long *sv_maxssiz; u_int sv_flags; void (*sv_set_syscall_retval)(struct thread *, int); - int (*sv_fetch_syscall_args)(struct thread *, struct - syscall_args *); + int (*sv_fetch_syscall_args)(struct thread *); const char **sv_syscallnames; vm_offset_t sv_timekeep_base; vm_offset_t sv_shared_page_base; vm_offset_t sv_shared_page_len; vm_offset_t sv_sigcode_base; void *sv_shared_page_obj; void (*sv_schedtail)(struct thread *); void (*sv_thread_detach)(struct thread *); int (*sv_trap)(struct thread *); }; #define SV_ILP32 0x000100 /* 32-bit executable. */ #define SV_LP64 0x000200 /* 64-bit executable. */ #define SV_IA32 0x004000 /* Intel 32-bit executable. */ #define SV_AOUT 0x008000 /* a.out executable. */ #define SV_SHP 0x010000 /* Shared page. */ #define SV_CAPSICUM 0x020000 /* Force cap_enter() on startup. */ #define SV_TIMEKEEP 0x040000 /* Shared page timehands. */ #define SV_ABI_MASK 0xff #define SV_ABI_ERRNO(p, e) ((p)->p_sysent->sv_errsize <= 0 ? e : \ ((e) >= (p)->p_sysent->sv_errsize ? -1 : (p)->p_sysent->sv_errtbl[e])) #define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x)) #define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK) #define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x) #define SV_CURPROC_ABI() SV_PROC_ABI(curproc) /* same as ELFOSABI_XXX, to prevent header pollution */ #define SV_ABI_LINUX 3 #define SV_ABI_FREEBSD 9 #define SV_ABI_CLOUDABI 17 #define SV_ABI_UNDEF 255 #ifdef _KERNEL extern struct sysentvec aout_sysvec; extern struct sysent sysent[]; extern const char *syscallnames[]; #if defined(__amd64__) extern int i386_read_exec; #endif #define NO_SYSCALL (-1) struct module; struct syscall_module_data { int (*chainevh)(struct module *, int, void *); /* next handler */ void *chainarg; /* arg for next event handler */ int *offset; /* offset into sysent */ struct sysent *new_sysent; /* new sysent */ struct sysent old_sysent; /* old sysent */ int flags; /* flags for syscall_register */ }; /* separate initialization vector so it can be used in a substructure */ #define SYSENT_INIT_VALS(_syscallname) { \ .sy_narg = (sizeof(struct _syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)&sys_##_syscallname, \ .sy_auevent = SYS_AUE_##_syscallname, \ .sy_systrace_args_func = NULL, \ .sy_entry = 0, \ .sy_return = 0, \ .sy_flags = 0, \ .sy_thrcnt = 0 \ } #define MAKE_SYSENT(syscallname) \ static struct sysent syscallname##_sysent = SYSENT_INIT_VALS(syscallname); #define MAKE_SYSENT_COMPAT(syscallname) \ static struct sysent syscallname##_sysent = { \ (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ (sy_call_t *)& syscallname, \ SYS_AUE_##syscallname \ } #define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \ static struct syscall_module_data name##_syscall_mod = { \ evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \ }; \ \ static moduledata_t name##_mod = { \ "sys/" #name, \ syscall_module_handler, \ &name##_syscall_mod \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE) #define SYSCALL_MODULE_HELPER(syscallname) \ static int syscallname##_syscall = SYS_##syscallname; \ MAKE_SYSENT(syscallname); \ SYSCALL_MODULE(syscallname, \ & syscallname##_syscall, & syscallname##_sysent, \ NULL, NULL) #define SYSCALL_MODULE_PRESENT(syscallname) \ (sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \ sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys) /* * Syscall registration helpers with resource allocation handling. */ struct syscall_helper_data { struct sysent new_sysent; struct sysent old_sysent; int syscall_no; int registered; }; #define SYSCALL_INIT_HELPER(syscallname) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& sys_ ## syscallname, \ .sy_auevent = SYS_AUE_##syscallname \ }, \ .syscall_no = SYS_##syscallname \ } #define SYSCALL_INIT_HELPER_COMPAT(syscallname) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& syscallname, \ .sy_auevent = SYS_AUE_##syscallname \ }, \ .syscall_no = SYS_##syscallname \ } #define SYSCALL_INIT_LAST { \ .syscall_no = NO_SYSCALL \ } int syscall_register(int *offset, struct sysent *new_sysent, struct sysent *old_sysent, int flags); int syscall_deregister(int *offset, struct sysent *old_sysent); int syscall_module_handler(struct module *mod, int what, void *arg); int syscall_helper_register(struct syscall_helper_data *sd, int flags); int syscall_helper_unregister(struct syscall_helper_data *sd); struct proc; const char *syscallname(struct proc *p, u_int code); /* Special purpose system call functions. */ struct nosys_args; int lkmnosys(struct thread *, struct nosys_args *); int lkmressys(struct thread *, struct nosys_args *); int syscall_thread_enter(struct thread *td, struct sysent *se); void syscall_thread_exit(struct thread *td, struct sysent *se); int shared_page_alloc(int size, int align); int shared_page_fill(int size, int align, const void *data); void shared_page_write(int base, int size, const void *data); void exec_sysvec_init(void *param); void exec_inittk(void); #define INIT_SYSENTVEC(name, sv) \ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ (sysinit_cfunc_t)exec_sysvec_init, sv); #endif /* _KERNEL */ #endif /* !_SYS_SYSENT_H_ */