Index: head/sys/amd64/linux32/linux32_sysvec.c =================================================================== --- head/sys/amd64/linux32/linux32_sysvec.c (revision 283421) +++ head/sys/amd64/linux32/linux32_sysvec.c (revision 283422) @@ -1,1221 +1,1208 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define AUXARGS_ENTRY_32(pos, id, val) \ do { \ suword32(pos++, id); \ suword32(pos++, val); \ } while (0) #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 const char *linux_kplatform; static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux32_locore_o_start; extern char _binary_linux32_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static register_t *linux_copyout_strings(struct image_params *imgp); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static void linux32_fixlimit(struct rlimit *rl, int which); static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); -static eventhandler_tag linux_exit_tag; -static eventhandler_tag linux_exec_tag; -static eventhandler_tag linux_thread_dtor_tag; - /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 0, LINUX_SIGUSR1, LINUX_SIGUSR2 }; int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGTRAP, SIGABRT, SIGBUS, SIGFPE, SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, SIGPIPE, SIGALRM, SIGTERM, SIGBUS, SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, SIGIO, SIGURG, SIGSYS }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); base = (Elf32_Addr *)*stack_base; args = (Elf32_Auxargs *)imgp->auxargs; pos = base + (imgp->args->argc + imgp->args->envc + 2); AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall); AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY_32(pos, AT_BASE, args->base); AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0); AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); if (args->execfd != -1) AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY_32(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword32(base, (uint32_t)imgp->args->argc); *stack_base = (register_t *)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn * and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack, i; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags, i; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; p = td->td_proc; frame = td->td_frame; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an * alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = imgp->ps_strings; fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); td->td_retval[1] = 0; } /* * XXX copied from ia32_sysvec.c. */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; u_int32_t *vectp; char *stringp, *destp; u_int32_t *stack_base; struct linux32_ps_strings *arginfo; /* * Calculate string base and vector table pointers. */ arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (caddr_t)arginfo - SPARE_USRSPACE - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(u_int32_t)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (u_int32_t *)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); /* * vectp also becomes our initial stack base */ stack_base = vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword32(vectp++, 0); suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword32(vectp, 0); return ((register_t *)stack_base); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); #if defined(DEBUG) SYSCTL_PROC(_compat_linux32, OID_AUTO, debug, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, linux_sysctl_debug, "A", "Linux debugging control"); #endif static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_sigsize = LINUX_SIGTBLSZ, .sv_sigtbl = bsd_to_linux_signal, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_prepsyscall = NULL, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux32_locore_o_end - &_binary_linux32_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; linux_kplatform = linux_shared_page_mapping + (linux_platform - (caddr_t)LINUX32_SHAREDPAGE); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t)linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj); }; SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. */ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); - linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, - linux_proc_exit, NULL, 1000); - linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, - linux_proc_exec, NULL, 1000); - linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, - linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); - EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); - EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); - EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); Index: head/sys/compat/linux/linux_common.c =================================================================== --- head/sys/compat/linux/linux_common.c (revision 283421) +++ head/sys/compat/linux/linux_common.c (revision 283422) @@ -1,74 +1,89 @@ /*- * Copyright (c) 2014 Vassilis Laganakos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include +#include +#include #include #include MODULE_VERSION(linux_common, 1); SET_DECLARE(linux_device_handler_set, struct linux_device_handler); +static eventhandler_tag linux_exec_tag; +static eventhandler_tag linux_thread_dtor_tag; +static eventhandler_tag linux_exit_tag; + static int linux_common_modevent(module_t mod, int type, void *data) { struct linux_device_handler **ldhp; switch(type) { case MOD_LOAD: linux_osd_jail_register(); + linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, + linux_proc_exit, NULL, 1000); + linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, + linux_proc_exec, NULL, 1000); + linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, + linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); break; case MOD_UNLOAD: linux_osd_jail_deregister(); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_unregister_handler(*ldhp); + EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); + EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); + EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t linux_common_mod = { "linuxcommon", linux_common_modevent, 0 }; DECLARE_MODULE(linuxcommon, linux_common_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/compat/linux/linux_emul.c =================================================================== --- head/sys/compat/linux/linux_emul.c (revision 283421) +++ head/sys/compat/linux/linux_emul.c (revision 283422) @@ -1,357 +1,243 @@ /*- * Copyright (c) 2006 Roman Divacky * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); -#include "opt_compat.h" - #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include -#ifdef COMPAT_LINUX32 -#include -#include -#else -#include -#include -#endif - -#include #include -#include #include #include -/** - * Special DTrace provider for the linuxulator. - * - * In this file we define the provider for the entire linuxulator. All - * modules (= files of the linuxulator) use it. - * - * We define a different name depending on the emulated bitsize, see - * ../..//linux{,32}/linux.h, e.g.: - * native bitsize = linuxulator - * amd64, 32bit emulation = linuxulator32 - */ -LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE); -/** - * DTrace probes in this module. - */ -LIN_SDT_PROBE_DEFINE1(emul, em_find, entry, "struct thread *"); -LIN_SDT_PROBE_DEFINE0(emul, em_find, return); -LIN_SDT_PROBE_DEFINE3(emul, proc_init, entry, "struct thread *", - "struct thread *", "int"); -LIN_SDT_PROBE_DEFINE0(emul, proc_init, create_thread); -LIN_SDT_PROBE_DEFINE0(emul, proc_init, fork); -LIN_SDT_PROBE_DEFINE0(emul, proc_init, exec); -LIN_SDT_PROBE_DEFINE0(emul, proc_init, return); -LIN_SDT_PROBE_DEFINE1(emul, proc_exit, entry, "struct proc *"); -LIN_SDT_PROBE_DEFINE1(emul, linux_thread_detach, entry, "struct thread *"); -LIN_SDT_PROBE_DEFINE0(emul, linux_thread_detach, futex_failed); -LIN_SDT_PROBE_DEFINE1(emul, linux_thread_detach, child_clear_tid_error, "int"); -LIN_SDT_PROBE_DEFINE0(emul, linux_thread_detach, return); -LIN_SDT_PROBE_DEFINE2(emul, proc_exec, entry, "struct proc *", - "struct image_params *"); -LIN_SDT_PROBE_DEFINE0(emul, proc_exec, return); -LIN_SDT_PROBE_DEFINE0(emul, linux_schedtail, entry); -LIN_SDT_PROBE_DEFINE1(emul, linux_schedtail, copyout_error, "int"); -LIN_SDT_PROBE_DEFINE0(emul, linux_schedtail, return); -LIN_SDT_PROBE_DEFINE1(emul, linux_set_tid_address, entry, "int *"); -LIN_SDT_PROBE_DEFINE0(emul, linux_set_tid_address, return); - /* - * This returns reference to the emuldata entry (if found) + * This returns reference to the thread emuldata entry (if found) * * Hold PROC_LOCK when referencing emuldata from other threads. */ struct linux_emuldata * em_find(struct thread *td) { struct linux_emuldata *em; - LIN_SDT_PROBE1(emul, em_find, entry, td); - em = td->td_emuldata; - LIN_SDT_PROBE1(emul, em_find, return, em); - return (em); } +/* + * This returns reference to the proc pemuldata entry (if found) + * + * Hold PROC_LOCK when referencing proc pemuldata from other threads. + * Hold LINUX_PEM_LOCK wher referencing pemuldata members. + */ +struct linux_pemuldata * +pem_find(struct proc *p) +{ + struct linux_pemuldata *pem; + + pem = p->p_emuldata; + + return (pem); +} + void linux_proc_init(struct thread *td, struct thread *newtd, int flags) { struct linux_emuldata *em; + struct linux_pemuldata *pem; - LIN_SDT_PROBE3(emul, proc_init, entry, td, newtd, flags); - if (newtd != NULL) { /* non-exec call */ em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO); em->pdeath_signal = 0; - em->flags = 0; em->robust_futexes = NULL; if (flags & LINUX_CLONE_THREAD) { - LIN_SDT_PROBE0(emul, proc_init, create_thread); - em->em_tid = newtd->td_tid; } else { - LIN_SDT_PROBE0(emul, proc_init, fork); em->em_tid = newtd->td_proc->p_pid; + + pem = malloc(sizeof(*pem), M_TEMP, M_WAITOK | M_ZERO); + sx_init(&pem->pem_sx, "lpemlk"); + newtd->td_proc->p_emuldata = pem; } newtd->td_emuldata = em; } else { /* exec */ - LIN_SDT_PROBE0(emul, proc_init, exec); /* lookup the old one */ em = em_find(td); KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n")); em->em_tid = td->td_proc->p_pid; } em->child_clear_tid = NULL; em->child_set_tid = NULL; - - LIN_SDT_PROBE0(emul, proc_init, return); } void linux_proc_exit(void *arg __unused, struct proc *p) { + struct linux_pemuldata *pem; struct thread *td = curthread; - if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX)) { - LIN_SDT_PROBE1(emul, proc_exit, entry, p); - (p->p_sysent->sv_thread_detach)(td); - } + if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX)) + return; + + pem = pem_find(p); + if (pem == NULL) + return; + (p->p_sysent->sv_thread_detach)(td); + + p->p_emuldata = NULL; + + sx_destroy(&pem->pem_sx); + free(pem, M_TEMP); } int linux_common_execve(struct thread *td, struct image_args *eargs) { + struct linux_pemuldata *pem; struct linux_emuldata *em; struct proc *p; int error; p = td->td_proc; /* * Unlike FreeBSD abort all other threads before * proceeding exec. */ PROC_LOCK(p); /* See exit1() comments. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { if (!thread_single(p, SINGLE_EXIT)) break; thread_suspend_check(0); } PROC_UNLOCK(p); error = kern_execve(td, eargs, NULL); if (error != 0) return (error); /* * In a case of transition from Linux binary execing to - * FreeBSD binary we destroy linux emuldata thread entry. + * FreeBSD binary we destroy linux emuldata thread & proc entries. */ if (SV_CURPROC_ABI() != SV_ABI_LINUX) { PROC_LOCK(p); em = em_find(td); - KASSERT(em != NULL, ("proc_exec: emuldata not found.\n")); + KASSERT(em != NULL, ("proc_exec: thread emuldata not found.\n")); td->td_emuldata = NULL; + + pem = pem_find(p); + KASSERT(pem != NULL, ("proc_exec: proc pemuldata not found.\n")); + p->p_emuldata = NULL; PROC_UNLOCK(p); free(em, M_TEMP); + free(pem, M_TEMP); } return (0); } void linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) { struct thread *td = curthread; /* * In a case of execing to linux binary we create linux * emuldata thread entry. */ if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX)) { - LIN_SDT_PROBE2(emul, proc_exec, entry, p, imgp); if (SV_PROC_ABI(p) == SV_ABI_LINUX) linux_proc_init(td, NULL, 0); else linux_proc_init(td, td, 0); - - LIN_SDT_PROBE0(emul, proc_exec, return); } } void -linux_thread_detach(struct thread *td) -{ - struct linux_sys_futex_args cup; - struct linux_emuldata *em; - int *child_clear_tid; - int null = 0; - int error; - - LIN_SDT_PROBE1(emul, linux_thread_detach, entry, td); - - em = em_find(td); - KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); - - LINUX_CTR1(exit, "thread detach(%d)", em->em_tid); - - release_futexes(td, em); - - child_clear_tid = em->child_clear_tid; - - if (child_clear_tid != NULL) { - - LINUX_CTR2(exit, "thread detach(%d) %p", - em->em_tid, child_clear_tid); - - error = copyout(&null, child_clear_tid, sizeof(null)); - if (error) { - LIN_SDT_PROBE1(emul, linux_thread_detach, - child_clear_tid_error, error); - - LIN_SDT_PROBE0(emul, linux_thread_detach, return); - return; - } - - cup.uaddr = child_clear_tid; - cup.op = LINUX_FUTEX_WAKE; - cup.val = 1; /* wake one */ - cup.timeout = NULL; - cup.uaddr2 = NULL; - cup.val3 = 0; - error = linux_sys_futex(td, &cup); - /* - * this cannot happen at the moment and if this happens it - * probably means there is a user space bug - */ - if (error) { - LIN_SDT_PROBE0(emul, linux_thread_detach, futex_failed); - printf(LMSG("futex stuff in thread_detach failed.\n")); - } - } - - LIN_SDT_PROBE0(emul, linux_thread_detach, return); -} - -void linux_thread_dtor(void *arg __unused, struct thread *td) { struct linux_emuldata *em; em = em_find(td); if (em == NULL) return; td->td_emuldata = NULL; LINUX_CTR1(exit, "thread dtor(%d)", em->em_tid); free(em, M_TEMP); } void linux_schedtail(struct thread *td) { struct linux_emuldata *em; struct proc *p; int error = 0; int *child_set_tid; - LIN_SDT_PROBE1(emul, linux_schedtail, entry, td); - p = td->td_proc; em = em_find(td); - KASSERT(em != NULL, ("linux_schedtail: emuldata not found.\n")); + KASSERT(em != NULL, ("linux_schedtail: thread emuldata not found.\n")); child_set_tid = em->child_set_tid; if (child_set_tid != NULL) { error = copyout(&em->em_tid, (int *)child_set_tid, sizeof(em->em_tid)); LINUX_CTR4(clone, "schedtail(%d) %p stored %d error %d", td->td_tid, child_set_tid, em->em_tid, error); - - if (error != 0) { - LIN_SDT_PROBE1(emul, linux_schedtail, copyout_error, - error); - } } else LINUX_CTR1(clone, "schedtail(%d)", em->em_tid); - - LIN_SDT_PROBE0(emul, linux_schedtail, return); -} - -int -linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) -{ - struct linux_emuldata *em; - - LIN_SDT_PROBE1(emul, linux_set_tid_address, entry, args->tidptr); - - em = em_find(td); - KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); - - em->child_clear_tid = args->tidptr; - - td->td_retval[0] = em->em_tid; - - LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", - em->em_tid, args->tidptr, td->td_retval[0]); - - LIN_SDT_PROBE0(emul, linux_set_tid_address, return); - return (0); } Index: head/sys/compat/linux/linux_emul.h =================================================================== --- head/sys/compat/linux/linux_emul.h (revision 283421) +++ head/sys/compat/linux/linux_emul.h (revision 283422) @@ -1,64 +1,75 @@ /*- * Copyright (c) 2006 Roman Divacky * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_EMUL_H_ #define _LINUX_EMUL_H_ /* * modeled after similar structure in NetBSD * this will be extended as we need more functionality */ struct linux_emuldata { int *child_set_tid; /* in clone(): Child's TID to set on clone */ int *child_clear_tid;/* in clone(): Child's TID to clear on exit */ int pdeath_signal; /* parent death signal */ - int flags; /* different emuldata flags */ + int flags; /* thread emuldata flags */ int em_tid; /* thread id */ struct linux_robust_list_head *robust_futexes; }; struct linux_emuldata *em_find(struct thread *); -/* emuldata flags */ -#define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated - futex REQUEUE op*/ - void linux_proc_init(struct thread *, struct thread *, int); void linux_proc_exit(void *, struct proc *); void linux_schedtail(struct thread *); void linux_proc_exec(void *, struct proc *, struct image_params *); void linux_thread_dtor(void *arg __unused, struct thread *); void linux_thread_detach(struct thread *); int linux_common_execve(struct thread *, struct image_args *); + +/* process emuldata flags */ +#define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated + futex REQUEUE op*/ +struct linux_pemuldata { + uint32_t flags; /* process emuldata flags */ + struct sx pem_sx; /* lock for this struct */ +}; + +#define LINUX_PEM_XLOCK(p) sx_xlock(&(p)->pem_sx) +#define LINUX_PEM_XUNLOCK(p) sx_xunlock(&(p)->pem_sx) +#define LINUX_PEM_SLOCK(p) sx_slock(&(p)->pem_sx) +#define LINUX_PEM_SUNLOCK(p) sx_sunlock(&(p)->pem_sx) + +struct linux_pemuldata *pem_find(struct proc *); #endif /* !_LINUX_EMUL_H_ */ Index: head/sys/compat/linux/linux_fork.c =================================================================== --- head/sys/compat/linux/linux_fork.c (revision 283421) +++ head/sys/compat/linux/linux_fork.c (revision 283422) @@ -1,406 +1,467 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include +#include #include #include int linux_fork(struct thread *td, struct linux_fork_args *args) { int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(fork)) printf(ARGS(fork, "")); #endif if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2, NULL, 0)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); return (0); } int linux_vfork(struct thread *td, struct linux_vfork_args *args) { int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(vfork)) printf(ARGS(vfork, "")); #endif /* Exclude RFPPWAIT */ if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2, NULL, 0)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); PROC_LOCK(p2); p2->p_flag |= P_PPWAIT; PROC_UNLOCK(p2); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); return (0); } static int linux_clone_proc(struct thread *td, struct linux_clone_args *args) { int error, ff = RFPROC | RFSTOPPED; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { if (exit_signal <= LINUX_SIGTBLSZ) exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; /* * XXX: In Linux, sharing of fs info (chroot/cwd/umask) * and open files is independant. In FreeBSD, its in one * structure but in reality it does not cause any problems * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); error = fork1(td, ff, 0, &p2, NULL, 0); if (error) return (error); td2 = FIRST_THREAD_IN_PROC(p2); /* create the emuldata */ linux_proc_init(td, td2, args->flags); em = em_find(td2); KASSERT(em != NULL, ("clone_proc: emuldata not found.\n")); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) printf(LMSG("copyout failed!")); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ linux_set_upcall_kse(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); #ifdef DEBUG if (ldebug(clone)) printf(LMSG("clone: successful rfork to %d, " "stack %p sig = %d"), (int)p2->p_pid, args->stack, exit_signal); #endif if (args->flags & LINUX_CLONE_VFORK) { PROC_LOCK(p2); p2->p_flag |= P_PPWAIT; PROC_UNLOCK(p2); } /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); td->td_retval[0] = p2->p_pid; if (args->flags & LINUX_CLONE_VFORK) { /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); } return (0); } static int linux_clone_thread(struct thread *td, struct linux_clone_args *args) { struct linux_emuldata *em; struct thread *newtd; struct proc *p; int error; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "thread: flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif LINUX_CTR4(clone, "thread(%d) flags %x ptid %p ctid %p", td->td_tid, (unsigned)args->flags, args->parent_tidptr, args->child_tidptr); if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); /* Threads should be created with own stack */ if (args->stack == NULL) return (EINVAL); p = td->td_proc; /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) return (error); cpu_set_upcall(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = p; newtd->td_ucred = crhold(td->td_ucred); /* create the emuldata */ linux_proc_init(td, newtd, args->flags); em = em_find(newtd); KASSERT(em != NULL, ("clone_thread: emuldata not found.\n")); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(newtd, args->tls); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; cpu_thread_clean(newtd); linux_set_upcall_kse(newtd, PTROUT(args->stack)); PROC_LOCK(p); p->p_flag |= P_HADTHREADS; newtd->td_sigmask = td->td_sigmask; bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); if (args->flags & LINUX_CLONE_PARENT) thread_link(newtd, p->p_pptr); else thread_link(newtd, p); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; PROC_UNLOCK(p); tidhash_add(newtd); #ifdef DEBUG if (ldebug(clone)) printf(ARGS(clone, "successful clone to %d, stack %p"), (int)newtd->td_tid, args->stack); #endif LINUX_CTR2(clone, "thread(%d) successful clone to %d", td->td_tid, newtd->td_tid); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&newtd->td_tid, args->parent_tidptr, sizeof(newtd->td_tid)); if (error) printf(LMSG("clone_thread: copyout failed!")); } /* * Make this runnable after we are finished with it. */ thread_lock(newtd); TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); td->td_retval[0] = newtd->td_tid; return (0); } int linux_clone(struct thread *td, struct linux_clone_args *args) { if (args->flags & LINUX_CLONE_THREAD) return (linux_clone_thread(td, args)); else return (linux_clone_proc(td, args)); } int linux_exit(struct thread *td, struct linux_exit_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("exit: emuldata not found.\n")); LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval); linux_thread_detach(td); /* * XXX. When the last two threads of a process * exit via pthread_exit() try thr_exit() first. */ kern_thr_exit(td); exit1(td, W_EXITCODE(args->rval, 0)); /* NOTREACHED */ +} + +int +linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) +{ + struct linux_emuldata *em; + + em = em_find(td); + KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); + + em->child_clear_tid = args->tidptr; + + td->td_retval[0] = em->em_tid; + + LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", + em->em_tid, args->tidptr, td->td_retval[0]); + + return (0); +} + +void +linux_thread_detach(struct thread *td) +{ + struct linux_sys_futex_args cup; + struct linux_emuldata *em; + int *child_clear_tid; + int error; + + em = em_find(td); + KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); + + LINUX_CTR1(exit, "thread detach(%d)", em->em_tid); + + release_futexes(td, em); + + child_clear_tid = em->child_clear_tid; + + if (child_clear_tid != NULL) { + + LINUX_CTR2(exit, "thread detach(%d) %p", + em->em_tid, child_clear_tid); + + error = suword32(child_clear_tid, 0); + if (error != 0) + return; + + cup.uaddr = child_clear_tid; + cup.op = LINUX_FUTEX_WAKE; + cup.val = 1; /* wake one */ + cup.timeout = NULL; + cup.uaddr2 = NULL; + cup.val3 = 0; + error = linux_sys_futex(td, &cup); + /* + * this cannot happen at the moment and if this happens it + * probably means there is a user space bug + */ + if (error != 0) + linux_msg(td, "futex stuff in thread_detach failed."); + } } Index: head/sys/compat/linux/linux_futex.c =================================================================== --- head/sys/compat/linux/linux_futex.c (revision 283421) +++ head/sys/compat/linux/linux_futex.c (revision 283422) @@ -1,1236 +1,1236 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * Futex part for the special DTrace module "locks". */ LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); /** * Per futex probes. */ LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_put, return); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *", "struct waiting_proc **", "struct futex **"); LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *", "struct waiting_proc **", "int"); LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", "struct waiting_proc *", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int", "struct futex *", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *", "struct waiting_proc **", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *", "struct linux_sys_futex_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, itimerfix_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq, "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *", "struct linux_set_robust_list_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *", "struct linux_get_robust_list_args *"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct linux_emuldata *", "uint32_t *", "unsigned int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry, "struct linux_robust_list **", "struct linux_robust_list **", "unsigned int *"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *", "struct linux_emuldata *"); LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return); static MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes"); static MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp"); struct futex; struct waiting_proc { uint32_t wp_flags; struct futex *wp_futex; TAILQ_ENTRY(waiting_proc) wp_list; }; struct futex { struct sx f_lck; uint32_t *f_uaddr; /* user-supplied value, for debug */ struct umtx_key f_key; uint32_t f_refcount; uint32_t f_bitset; LIST_ENTRY(futex) f_list; TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; }; struct futex_list futex_list; #define FUTEX_LOCK(f) sx_xlock(&(f)->f_lck) #define FUTEX_UNLOCK(f) sx_xunlock(&(f)->f_lck) #define FUTEX_INIT(f) do { \ sx_init_flags(&(f)->f_lck, "ftlk", \ SX_DUPOK); \ LIN_SDT_PROBE1(futex, futex, create, \ &(f)->f_lck); \ } while (0) #define FUTEX_DESTROY(f) do { \ LIN_SDT_PROBE1(futex, futex, destroy, \ &(f)->f_lck); \ sx_destroy(&(f)->f_lck); \ } while (0) #define FUTEX_ASSERT_LOCKED(f) sx_assert(&(f)->f_lck, SA_XLOCKED) struct mtx futex_mtx; /* protects the futex list */ #define FUTEXES_LOCK do { \ mtx_lock(&futex_mtx); \ LIN_SDT_PROBE1(locks, futex_mtx, \ locked, &futex_mtx); \ } while (0) #define FUTEXES_UNLOCK do { \ LIN_SDT_PROBE1(locks, futex_mtx, \ unlock, &futex_mtx); \ mtx_unlock(&futex_mtx); \ } while (0) /* flags for futex_get() */ #define FUTEX_CREATE_WP 0x1 /* create waiting_proc */ #define FUTEX_DONTCREATE 0x2 /* don't create futex if not exists */ #define FUTEX_DONTEXISTS 0x4 /* return EINVAL if futex exists */ #define FUTEX_SHARED 0x8 /* shared futex */ /* wp_flags */ #define FUTEX_WP_REQUEUED 0x1 /* wp requeued - wp moved from wp_list * of futex where thread sleep to wp_list * of another futex. */ #define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex * wp_list to prevent double wakeup. */ /* support.s */ int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval); int futex_addl(int oparg, uint32_t *uaddr, int *oldval); int futex_orl(int oparg, uint32_t *uaddr, int *oldval); int futex_andl(int oparg, uint32_t *uaddr, int *oldval); int futex_xorl(int oparg, uint32_t *uaddr, int *oldval); static void futex_put(struct futex *f, struct waiting_proc *wp) { LIN_SDT_PROBE2(futex, futex_put, entry, f, wp); FUTEX_ASSERT_LOCKED(f); if (wp != NULL) { if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); free(wp, M_FUTEX_WP); } FUTEXES_LOCK; if (--f->f_refcount == 0) { LIST_REMOVE(f, f_list); FUTEXES_UNLOCK; FUTEX_UNLOCK(f); LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d " "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); umtx_key_release(&f->f_key); FUTEX_DESTROY(f); free(f, M_FUTEX); LIN_SDT_PROBE0(futex, futex_put, return); return; } LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEXES_UNLOCK; FUTEX_UNLOCK(f); LIN_SDT_PROBE0(futex, futex_put, return); } static int futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags) { struct futex *f, *tmpf; struct umtx_key key; int error; LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags); *newf = tmpf = NULL; error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE, &key); if (error) { LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error); LIN_SDT_PROBE1(futex, futex_get0, return, error); return (error); } retry: FUTEXES_LOCK; LIST_FOREACH(f, &futex_list, f_list) { if (umtx_key_match(&f->f_key, &key)) { if (tmpf != NULL) { FUTEX_UNLOCK(tmpf); FUTEX_DESTROY(tmpf); free(tmpf, M_FUTEX); } if (flags & FUTEX_DONTEXISTS) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, return, EINVAL); return (EINVAL); } /* * Increment refcount of the found futex to * prevent it from deallocation before FUTEX_LOCK() */ ++f->f_refcount; FUTEXES_UNLOCK; umtx_key_release(&key); FUTEX_LOCK(f); *newf = f; LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d", uaddr, f->f_refcount, f->f_key.shared); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } } if (flags & FUTEX_DONTCREATE) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, null, uaddr); LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } if (tmpf == NULL) { FUTEXES_UNLOCK; tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO); tmpf->f_uaddr = uaddr; tmpf->f_key = key; tmpf->f_refcount = 1; tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY; FUTEX_INIT(tmpf); TAILQ_INIT(&tmpf->f_waiting_proc); /* * Lock the new futex before an insert into the futex_list * to prevent futex usage by other. */ FUTEX_LOCK(tmpf); goto retry; } LIST_INSERT_HEAD(&futex_list, tmpf, f_list); FUTEXES_UNLOCK; LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount, tmpf->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new", uaddr, tmpf->f_refcount, tmpf->f_key.shared); *newf = tmpf; LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } static int futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f, uint32_t flags) { int error; LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f); if (flags & FUTEX_CREATE_WP) { *wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK); (*wp)->wp_flags = 0; } error = futex_get0(uaddr, f, flags); if (error) { LIN_SDT_PROBE0(futex, futex_get, error); if (flags & FUTEX_CREATE_WP) free(*wp, M_FUTEX_WP); LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } if (flags & FUTEX_CREATE_WP) { TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list); (*wp)->wp_futex = *f; } LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } static int futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout) { int error; FUTEX_ASSERT_LOCKED(f); LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout); LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d", f->f_uaddr, wp, timeout, f->f_refcount); error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout); if (wp->wp_flags & FUTEX_WP_REQUEUED) { KASSERT(f != wp->wp_futex, ("futex != wp_futex")); if (error) { LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); } LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp" " %p requeued uaddr %p ref %d", error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); futex_put(f, NULL); f = wp->wp_futex; FUTEX_LOCK(f); } else { if (error) { LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error, f->f_uaddr, wp); } LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p", error, f->f_uaddr, wp); } futex_put(f, wp); LIN_SDT_PROBE1(futex, futex_sleep, return, error); return (error); } static int futex_wake(struct futex *f, int n, uint32_t bitset) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL); return (EINVAL); } FUTEX_ASSERT_LOCKED(f); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp, f->f_refcount); LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d", f->f_uaddr, wp, f->f_refcount); /* * Unless we find a matching bit in * the bitset, continue searching. */ if (!(wp->wp_futex->f_bitset & bitset)) continue; wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp); wakeup_one(wp); if (++count == n) break; } LIN_SDT_PROBE1(futex, futex_wake, return, count); return (count); } static int futex_requeue(struct futex *f, int n, struct futex *f2, int n2) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2); FUTEX_ASSERT_LOCKED(f); FUTEX_ASSERT_LOCKED(f2); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { if (++count <= n) { LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p", f->f_uaddr, wp); wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp); wakeup_one(wp); } else { LIN_SDT_PROBE3(futex, futex_requeue, requeue, f->f_uaddr, wp, f2->f_uaddr); LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p", f->f_uaddr, wp, f2->f_uaddr); wp->wp_flags |= FUTEX_WP_REQUEUED; /* Move wp to wp_list of f2 futex */ TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list); /* * Thread which sleeps on wp after waking should * acquire f2 lock, so increment refcount of f2 to * prevent it from premature deallocation. */ wp->wp_futex = f2; FUTEXES_LOCK; ++f2->f_refcount; FUTEXES_UNLOCK; if (count - n >= n2) break; } } LIN_SDT_PROBE1(futex, futex_requeue, return, count); return (count); } static int futex_wait(struct futex *f, struct waiting_proc *wp, int timeout_hz, uint32_t bitset) { int error; LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, timeout_hz, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL); return (EINVAL); } f->f_bitset = bitset; error = futex_sleep(f, wp, timeout_hz); if (error) LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); if (error == EWOULDBLOCK) error = ETIMEDOUT; LIN_SDT_PROBE1(futex, futex_wait, return, error); return (error); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr); if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); /* XXX: Linux verifies access here and returns EFAULT */ LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) { LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { int clockrt, nrwake, op_ret, ret; - struct linux_emuldata *em; + struct linux_pemuldata *pem; struct waiting_proc *wp; struct futex *f, *f2; struct l_timespec timeout; struct timeval utv, ctv; int timeout_hz; int error; uint32_t flags, val; LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args); if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else flags = FUTEX_SHARED; /* * Currently support for switching between CLOCK_MONOTONIC and * CLOCK_REALTIME is not present. However Linux forbids the use of * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and * FUTEX_WAIT_REQUEUE_PI. */ clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } error = 0; f = f2 = NULL; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, &wp, &f, flags | FUTEX_CREATE_WP); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = copyin(args->uaddr, &val, sizeof(val)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val) { LIN_SDT_PROBE4(futex, linux_sys_futex, debug_wait_value_neq, args->uaddr, args->val, val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x != uval 0x%x", args->uaddr, args->val, val); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EWOULDBLOCK); return (EWOULDBLOCK); } if (args->timeout != NULL) { error = copyin(args->timeout, &timeout, sizeof(timeout)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); futex_put(f, wp); return (error); } TIMESPEC_TO_TIMEVAL(&utv, &timeout); error = itimerfix(&utv); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, itimerfix_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); futex_put(f, wp); return (error); } if (clockrt) { microtime(&ctv); timevalsub(&utv, &ctv); } else if (args->op == LINUX_FUTEX_WAIT_BITSET) { microuptime(&ctv); timevalsub(&utv, &ctv); } if (utv.tv_sec < 0) timevalclear(&utv); timeout_hz = tvtohz(&utv); } else timeout_hz = 0; error = futex_wait(f, wp, timeout_hz, args->val3); break; case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTCREATE); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (f == NULL) { td->td_retval[0] = 0; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } td->td_retval[0] = futex_wake(f, args->val, args->val3); futex_put(f, NULL); break; case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. */ if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } error = futex_get(args->uaddr, NULL, &f, flags); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * To avoid deadlocks return EINVAL if second futex * exists at this time. * * Glibc fall back to FUTEX_WAKE in case of any error * returned by FUTEX_CMP_REQUEUE. */ error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTEXISTS); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = copyin(args->uaddr, &val, sizeof(val)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", error); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val3) { LIN_SDT_PROBE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, args->val, val); LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", args->val, val); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN); return (EAGAIN); } nrwake = (int)(unsigned long)args->timeout; td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake); futex_put(f2, NULL); futex_put(f, NULL); break; case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->timeout); error = futex_get(args->uaddr, NULL, &f, flags); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (args->uaddr != args->uaddr2) error = futex_get(args->uaddr2, NULL, &f2, flags); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * This function returns positive number as results and * negative as errors */ op_ret = futex_atomic_op(td, args->val3, args->uaddr2); LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", args->uaddr, op_ret); if (op_ret < 0) { /* XXX: We don't handle the EFAULT yet. */ if (op_ret != -EFAULT) { if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, -op_ret); return (-op_ret); } else { LIN_SDT_PROBE0(futex, linux_sys_futex, unhandled_efault); } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EFAULT); return (EFAULT); } ret = futex_wake(f, args->val, args->val3); if (op_ret > 0) { op_ret = 0; nrwake = (int)(unsigned long)args->timeout; if (f2 != NULL) op_ret += futex_wake(f2, nrwake, args->val3); else op_ret += futex_wake(f, nrwake, args->val3); ret += op_ret; } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); td->td_retval[0] = ret; break; case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_LOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_UNLOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_TRYLOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ - em = em_find(td); - if ((em->flags & LINUX_XDEPR_REQUEUEOP) == 0) { + pem = pem_find(td->td_proc); + if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_requeue op\n"); - em->flags |= LINUX_XDEPR_REQUEUEOP; + pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_sys_futex, deprecated_requeue); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op FUTEX_WAIT_REQUEUE_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); default: linux_msg(td, "linux_sys_futex: unknown op %d\n", args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation, args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args); if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0); return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args); if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, ESRCH); return (ESRCH); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT); return (EFAULT); } error = copyout(head, args->head, sizeof(struct linux_robust_list_head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error); return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; struct futex *f; int error; LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi); retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) { LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_get(uaddr, NULL, &f, FUTEX_DONTCREATE | FUTEX_SHARED); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, return, error); return (error); } if (f != NULL) { futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); futex_put(f, NULL); } } } LIN_SDT_PROBE1(futex, handle_futex_death, return, 0); return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi); error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0); return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; LIN_SDT_PROBE2(futex, release_futexes, entry, td, em); head = em->robust_futexes; if (head == NULL) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (rc) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); LIN_SDT_PROBE0(futex, release_futexes, return); } Index: head/sys/compat/linux/linux_misc.c =================================================================== --- head/sys/compat/linux/linux_misc.c (revision 283421) +++ head/sys/compat/linux/linux_misc.c (revision 283422) @@ -1,2347 +1,2362 @@ /*- * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif +#include #include #include #include #include #include #include #include + +/** + * Special DTrace provider for the linuxulator. + * + * In this file we define the provider for the entire linuxulator. All + * modules (= files of the linuxulator) use it. + * + * We define a different name depending on the emulated bitsize, see + * ../..//linux{,32}/linux.h, e.g.: + * native bitsize = linuxulator + * amd64, 32bit emulation = linuxulator32 + */ +LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE); int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalbig; l_ulong freebig; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; struct l_pselect6arg { l_uintptr_t ss; l_size_t ss_len; }; int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; vm_object_t object; int i, j; struct timespec ts; getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - vm_cnt.v_wire_count * PAGE_SIZE; sysinfo.sharedram = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) sysinfo.sharedram += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); sysinfo.sharedram *= PAGE_SIZE; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; sysinfo.freebig = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error; #ifdef DEBUG if (ldebug(alarm)) printf(ARGS(alarm, "%u"), args->secs); #endif secs = args->secs; if (secs > INT_MAX) secs = INT_MAX; it.it_value.tv_sec = (long) secs; it.it_value.tv_usec = 0; it.it_interval.tv_sec = 0; it.it_interval.tv_usec = 0; error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); if (error) return (error); if (timevalisset(&old_it.it_value)) { if (old_it.it_value.tv_usec != 0) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; } return (0); } int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old; struct obreak_args /* { char * nsize; } */ tmp; #ifdef DEBUG if (ldebug(brk)) printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend); #endif old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (vm_offset_t)args->dsend; tmp.nsize = (char *)new; if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp)) td->td_retval[0] = (long)new; else td->td_retval[0] = (long)old; return (0); } #if defined(__i386__) /* XXX: what about amd64/linux32? */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; char *library; ssize_t aresid; int error, locked, writecount; LCONVPATHEXIST(td, args->library, &library); #ifdef DEBUG if (ldebug(uselib)) printf(ARGS(uselib, "%s"), library); #endif a_out = NULL; locked = 0; vp = NULL; NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) goto cleanup; vp = ni.ni_vp; NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = 1; /* Writable? */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) goto cleanup; if (writecount != 0) { error = ETXTBSY; goto cleanup; } /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ VOP_SET_TEXT(vp); /* * Lock no longer needed */ locked = 0; VOP_UNLOCK(vp, 0); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("uselib: Non page aligned binary %lu\n", file_offset); #endif /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { #ifdef DEBUG printf("uselib: Page aligned binary %lu\n", file_offset); #endif /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], ((long *)vmaddr)[1]); #endif if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: /* Unlock vnode if needed */ if (locked) VOP_UNLOCK(vp, 0); /* Release the temporary mapping. */ if (a_out) kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE); return (error); } #endif /* __i386__ */ int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; #ifdef DEBUG if (ldebug(select)) printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds, (void *)args->readfds, (void *)args->writefds, (void *)args->exceptfds, (void *)args->timeout); #endif /* * Store current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; #ifdef DEBUG if (ldebug(select)) printf(LMSG("incoming timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, sizeof(l_int) * 8); #ifdef DEBUG if (ldebug(select)) printf(LMSG("real select returns %d"), error); #endif if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); #ifdef DEBUG if (ldebug(select)) printf(LMSG("outgoing timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: #ifdef DEBUG if (ldebug(select)) printf(LMSG("select_out -> %d"), error); #endif return (error); } int linux_mremap(struct thread *td, struct linux_mremap_args *args) { struct munmap_args /* { void *addr; size_t len; } */ bsd_args; int error = 0; #ifdef DEBUG if (ldebug(mremap)) printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"), (void *)(uintptr_t)args->addr, (unsigned long)args->old_len, (unsigned long)args->new_len, (unsigned long)args->flags); #endif if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { bsd_args.addr = (caddr_t)((uintptr_t)args->addr + args->new_len); bsd_args.len = args->old_len - args->new_len; error = sys_munmap(td, &bsd_args); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { struct msync_args bsd_args; bsd_args.addr = (caddr_t)(uintptr_t)args->addr; bsd_args.len = (uintptr_t)args->len; bsd_args.flags = args->fl & ~LINUX_MS_SYNC; return (sys_msync(td, &bsd_args)); } int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; #ifdef DEBUG if (ldebug(time)) printf(ARGS(time, "*")); #endif microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. */ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; #ifdef DEBUG if (ldebug(times)) printf(ARGS(times, "*")); #endif if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; #ifdef DEBUG if (ldebug(newuname)) printf(ARGS(newuname, "*")); #endif linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utime)) printf(ARGS(utime, "%s, *"), fname); #endif if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utimes)) printf(ARGS(utimes, "%s, *"), fname); #endif if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &fname, dfd); #ifdef DEBUG if (ldebug(futimesat)) printf(ARGS(futimesat, "%s, *"), fname); #endif if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_common_wait(struct thread *td, int pid, int *status, int options, struct rusage *ru) { int error, tmpstat; error = kern_wait(td, pid, &tmpstat, options, ru); if (error) return (error); if (status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8); error = copyout(&tmpstat, status, sizeof(int)); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { int options; #ifdef DEBUG if (ldebug(waitpid)) printf(ARGS(waitpid, "%d, %p, %d"), args->pid, (void *)args->status, args->options); #endif /* * this is necessary because the test in kern_wait doesn't work * because we mess with the options here */ if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE)) return (EINVAL); options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; return (linux_common_wait(td, args->pid, args->status, options, NULL)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options; struct rusage ru, *rup; #ifdef DEBUG if (ldebug(wait4)) printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; if (args->rusage != NULL) rup = &ru; else rup = NULL; error = linux_common_wait(td, args->pid, args->status, options, rup); if (error != 0) return (error); if (args->rusage != NULL) error = linux_copyout_rusage(&ru, args->rusage); return (error); } int linux_waitid(struct thread *td, struct linux_waitid_args *args) { int status, options, sig; struct __wrusage wru; siginfo_t siginfo; l_siginfo_t lsi; idtype_t idtype; struct proc *p; int error; options = 0; linux_to_bsd_waitopts(args->options, &options); if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED)) return (EINVAL); if (!(options & (WEXITED | WUNTRACED | WCONTINUED))) return (EINVAL); switch (args->idtype) { case LINUX_P_ALL: idtype = P_ALL; break; case LINUX_P_PID: if (args->id <= 0) return (EINVAL); idtype = P_PID; break; case LINUX_P_PGID: if (args->id <= 0) return (EINVAL); idtype = P_PGID; break; default: return (EINVAL); } error = kern_wait6(td, idtype, args->id, &status, options, &wru, &siginfo); if (error != 0) return (error); if (args->rusage != NULL) { error = linux_copyout_rusage(&wru.wru_children, args->rusage); if (error != 0) return (error); } if (args->info != NULL) { p = td->td_proc; if (td->td_retval[0] == 0) bzero(&lsi, sizeof(lsi)); else { sig = BSD_TO_LINUX_SIGNAL(siginfo.si_signo); siginfo_to_lsiginfo(&siginfo, &lsi, sig); } error = copyout(&lsi, args->info, sizeof(lsi)); } td->td_retval[0] = 0; return (error); } int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mknod)) printf(ARGS(mknod, "%s, %d, %ju"), path, args->mode, (uintmax_t)args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHCREAT_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(mknodat)) printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, dfd, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! */ int linux_personality(struct thread *td, struct linux_personality_args *args) { #ifdef DEBUG if (ldebug(personality)) printf(ARGS(personality, "%lu"), (unsigned long)args->per); #endif if (args->per != 0) return (EINVAL); /* Yes Jim, it's still a Linux... */ td->td_retval[0] = 0; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; #ifdef DEBUG if (ldebug(setitimer)) printf(ARGS(setitimer, "%p, %p"), (void *)uap->itv, (void *)uap->oitv); #endif if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); #ifdef DEBUG if (ldebug(setitimer)) { printf("setitimer: value: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec); printf("setitimer: interval: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec); } #endif error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; #ifdef DEBUG if (ldebug(getitimer)) printf(ARGS(getitimer, "%p"), (void *)uap->itv); #endif error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_nice(struct thread *td, struct linux_nice_args *args) { struct setpriority_args bsd_args; bsd_args.which = PRIO_PROCESS; bsd_args.who = 0; /* current process */ bsd_args.prio = args->inc; return (sys_setpriority(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_TEMP); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_TEMP, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_TEMP); if (error) return (error); td->td_retval[0] = ngrp; return (0); } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; #ifdef DEBUG if (ldebug(setrlimit)) printf(ARGS(setrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(old_getrlimit)) printf(ARGS(old_getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(getrlimit)) printf(ARGS(getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_param sched_param; struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_setscheduler)) printf(ARGS(sched_setscheduler, "%d, %d, %p"), args->pid, args->policy, (const void *)args->param); #endif switch (args->policy) { case LINUX_SCHED_OTHER: policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: policy = SCHED_FIFO; break; case LINUX_SCHED_RR: policy = SCHED_RR; break; default: return (EINVAL); } error = copyin(args->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setscheduler(td, tdt, policy, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_getscheduler)) printf(ARGS(sched_getscheduler, "%d"), args->pid); #endif tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_max)) printf(ARGS(sched_get_priority_max, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_min)) printf(ARGS(sched_get_priority_min, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; #ifdef DEBUG if (ldebug(reboot)) printf(ARGS(reboot, "0x%x"), args->cmd); #endif if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, &bsd_args)); } /* * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that * are assumed to be preserved. The following lightweight syscalls fixes * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c * * linux_getpid() - MP SAFE * linux_getgid() - MP SAFE * linux_getuid() - MP SAFE */ int linux_getpid(struct thread *td, struct linux_getpid_args *args) { #ifdef DEBUG if (ldebug(getpid)) printf(ARGS(getpid, "")); #endif td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { struct linux_emuldata *em; #ifdef DEBUG if (ldebug(gettid)) printf(ARGS(gettid, "")); #endif em = em_find(td); KASSERT(em != NULL, ("gettid: emuldata not found.\n")); td->td_retval[0] = em->em_tid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { #ifdef DEBUG if (ldebug(getppid)) printf(ARGS(getppid, "")); #endif PROC_LOCK(td->td_proc); td->td_retval[0] = td->td_proc->p_pptr->p_pid; PROC_UNLOCK(td->td_proc); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { #ifdef DEBUG if (ldebug(getgid)) printf(ARGS(getgid, "")); #endif td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { #ifdef DEBUG if (ldebug(getuid)) printf(ARGS(getuid, "")); #endif td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { struct getsid_args bsd; #ifdef DEBUG if (ldebug(getsid)) printf(ARGS(getsid, "%i"), args->pid); #endif bsd.pid = args->pid; return (sys_getsid(td, &bsd)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { struct getpriority_args bsd_args; int error; #ifdef DEBUG if (ldebug(getpriority)) printf(ARGS(getpriority, "%i, %i"), args->which, args->who); #endif bsd_args.which = args->which; bsd_args.who = args->who; error = sys_getpriority(td, &bsd_args); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(sethostname)) printf(ARGS(sethostname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(setdomainname)) printf(ARGS(setdomainname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { #ifdef DEBUG if (ldebug(exit_group)) printf(ARGS(exit_group, "%i"), args->error_code); #endif LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, W_EXITCODE(args->error_code, 0)); /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION 0x19980330 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (args->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. */ bzero (&lucd, sizeof(lucd)); error = copyout(&lucd, args->datap, sizeof(lucd)); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL || args->datap == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(args->datap, &lucd, sizeof(lucd)); if (error != 0) return (error); /* We currently don't support setting any capabilities. */ if (lucd.effective || lucd.permitted || lucd.inheritable) { linux_msg(td, "capset effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", (int)lucd.effective, (int)lucd.permitted, (int)lucd.inheritable); return (EPERM); } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; struct linux_emuldata *em; int pdeath_signal; #ifdef DEBUG if (ldebug(prctl)) printf(ARGS(prctl, "%d, %ju, %ju, %ju, %ju"), args->option, (uintmax_t)args->arg2, (uintmax_t)args->arg3, (uintmax_t)args->arg4, (uintmax_t)args->arg5); #endif switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); em->pdeath_signal = args->arg2; break; case LINUX_PR_GET_PDEATHSIG: em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); pdeath_signal = em->pdeath_signal; error = copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal)); break; case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). */ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; default: error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_setparam)) printf(ARGS(sched_setparam, "%d, *"), uap->pid); #endif error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_getparam)) printf(ARGS(sched_getparam, "%d, *"), uap->pid); #endif tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); if (error == 0) error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { int error; struct thread *tdt; struct cpuset_getaffinity_args cga; #ifdef DEBUG if (ldebug(sched_getaffinity)) printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); cga.level = CPU_LEVEL_WHICH; cga.which = CPU_WHICH_TID; cga.id = tdt->td_tid; cga.cpusetsize = sizeof(cpuset_t); cga.mask = (cpuset_t *) args->user_mask_ptr; if ((error = sys_cpuset_getaffinity(td, &cga)) == 0) td->td_retval[0] = sizeof(cpuset_t); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct cpuset_setaffinity_args csa; struct thread *tdt; #ifdef DEBUG if (ldebug(sched_setaffinity)) printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); csa.level = CPU_LEVEL_WHICH; csa.which = CPU_WHICH_TID; csa.id = tdt->td_tid; csa.cpusetsize = sizeof(cpuset_t); csa.mask = (cpuset_t *) args->user_mask_ptr; return (sys_cpuset_setaffinity(td, &csa)); } struct linux_rlimit64 { uint64_t rlim_cur; uint64_t rlim_max; }; int linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args) { struct rlimit rlim, nrlim; struct linux_rlimit64 lrlim; struct proc *p; u_int which; int flags; int error; #ifdef DEBUG if (ldebug(prlimit64)) printf(ARGS(prlimit64, "%d, %d, %p, %p"), args->pid, args->resource, (void *)args->new, (void *)args->old); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); if (args->new != NULL) { /* * Note. Unlike FreeBSD where rlim is signed 64-bit Linux * rlim is unsigned 64-bit. FreeBSD treats negative limits * as INFINITY so we do not need a conversion even. */ error = copyin(args->new, &nrlim, sizeof(nrlim)); if (error != 0) return (error); } flags = PGET_HOLD | PGET_NOTWEXIT; if (args->new != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget(args->pid, flags, &p); if (error != 0) return (error); if (args->old != NULL) { PROC_LOCK(p); lim_rlimit(p, which, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur == RLIM_INFINITY) lrlim.rlim_cur = LINUX_RLIM_INFINITY; else lrlim.rlim_cur = rlim.rlim_cur; if (rlim.rlim_max == RLIM_INFINITY) lrlim.rlim_max = LINUX_RLIM_INFINITY; else lrlim.rlim_max = rlim.rlim_max; error = copyout(&lrlim, args->old, sizeof(lrlim)); if (error != 0) goto out; } if (args->new != NULL) error = kern_proc_setrlimit(td, p, which, &nrlim); out: PRELE(p); return (error); } int linux_pselect6(struct thread *td, struct linux_pselect6_args *args) { struct timeval utv, tv0, tv1, *tvp; struct l_pselect6arg lpse6; struct l_timespec lts; struct timespec uts; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; ssp = NULL; if (args->sig != NULL) { error = copyin(args->sig, &lpse6, sizeof(lpse6)); if (error != 0) return (error); if (lpse6.ss_len != sizeof(l_ss)) return (EINVAL); if (lpse6.ss != 0) { error = copyin(PTRIN(lpse6.ss), &l_ss, sizeof(l_ss)); if (error != 0) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } } /* * Currently glibc changes nanosecond number to microsecond. * This mean losing precision but for now it is hardly seen. */ if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); uts.tv_sec = lts.tv_sec; uts.tv_nsec = lts.tv_nsec; TIMESPEC_TO_TIMEVAL(&utv, &uts); if (itimerfix(&utv)) return (EINVAL); microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_pselect(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, ssp, sizeof(l_int) * 8); if (error == 0 && args->tsp != NULL) { if (td->td_retval[0] != 0) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); TIMEVAL_TO_TIMESPEC(&utv, &uts); lts.tv_sec = uts.tv_sec; lts.tv_nsec = uts.tv_nsec; error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } #if defined(DEBUG) || defined(KTR) /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */ u_char linux_debug_map[howmany(LINUX_SYS_MAXSYSCALL, sizeof(u_char))]; static int linux_debug(int syscall, int toggle, int global) { if (global) { char c = toggle ? 0 : 0xff; memset(linux_debug_map, c, sizeof(linux_debug_map)); return (0); } if (syscall < 0 || syscall >= LINUX_SYS_MAXSYSCALL) return (EINVAL); if (toggle) clrbit(linux_debug_map, syscall); else setbit(linux_debug_map, syscall); return (0); } /* * Usage: sysctl linux.debug=.<0/1> * * E.g.: sysctl linux.debug=21.0 * * As a special case, syscall "all" will apply to all syscalls globally. */ #define LINUX_MAX_DEBUGSTR 16 int linux_sysctl_debug(SYSCTL_HANDLER_ARGS) { char value[LINUX_MAX_DEBUGSTR], *p; int error, sysc, toggle; int global = 0; value[0] = '\0'; error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req); if (error || req->newptr == NULL) return (error); for (p = value; *p != '\0' && *p != '.'; p++); if (*p == '\0') return (EINVAL); *p++ = '\0'; sysc = strtol(value, NULL, 0); toggle = strtol(p, NULL, 0); if (strcmp(value, "all") == 0) global = 1; error = linux_debug(sysc, toggle, global); return (error); } #endif /* DEBUG || KTR */ int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; struct l_timespec lts; struct thread *tdt; int error; /* * According to man in case the invalid pid specified * EINVAL should be returned. */ if (uap->pid < 0) return (EINVAL); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, &ts); PROC_UNLOCK(tdt->td_proc); if (error != 0) return (error); lts.tv_sec = ts.tv_sec; lts.tv_nsec = ts.tv_nsec; return (copyout(<s, uap->interval, sizeof(lts))); } /* * In case when the Linux thread is the initial thread in * the thread group thread id is equal to the process id. * Glibc depends on this magic (assert in pthread_getattr_np.c). */ struct thread * linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) { struct linux_emuldata *em; struct thread *tdt; struct proc *p; tdt = NULL; if (tid == 0 || tid == td->td_tid) { tdt = td; PROC_LOCK(tdt->td_proc); } else if (tid > PID_MAX) tdt = tdfind(tid, pid); else { /* * Initial thread where the tid equal to the pid. */ p = pfind(tid); if (p != NULL) { if (SV_PROC_ABI(p) != SV_ABI_LINUX) { /* * p is not a Linuxulator process. */ PROC_UNLOCK(p); return (NULL); } FOREACH_THREAD_IN_PROC(p, tdt) { em = em_find(tdt); if (tid == em->em_tid) return (tdt); } PROC_UNLOCK(p); } return (NULL); } return (tdt); } void linux_to_bsd_waitopts(int options, int *bsdopts) { if (options & LINUX_WNOHANG) *bsdopts |= WNOHANG; if (options & LINUX_WUNTRACED) *bsdopts |= WUNTRACED; if (options & LINUX_WEXITED) *bsdopts |= WEXITED; if (options & LINUX_WCONTINUED) *bsdopts |= WCONTINUED; if (options & LINUX_WNOWAIT) *bsdopts |= WNOWAIT; if (options & __WCLONE) *bsdopts |= WLINUXCLONE; } Index: head/sys/modules/linux/Makefile =================================================================== --- head/sys/modules/linux/Makefile (revision 283421) +++ head/sys/modules/linux/Makefile (revision 283422) @@ -1,84 +1,85 @@ # $FreeBSD$ .if ${MACHINE_CPUARCH} == "amd64" SFX= 32 CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 .endif .PATH: ${.CURDIR}/../../compat/linux ${.CURDIR}/../../${MACHINE_CPUARCH}/linux${SFX} VDSO= linux${SFX}_vdso KMOD= linux -SRCS= linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \ +SRCS= linux_fork.c linux${SFX}_dummy.c linux_file.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ linux${SFX}_machdep.c linux_misc.c linux_signal.c \ linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \ linux${SFX}_sysvec.c linux_uid16.c linux_time.c \ linux_timer.c linux_vdso.c \ opt_inet6.h opt_compat.h opt_posix.h opt_usb.h vnode_if.h \ device_if.h bus_if.h assym.s \ linux${SFX}_support.s DPSRCS= linux${SFX}_genassym.c # XXX: for assym.s SRCS+= opt_kstack_pages.h opt_nfs.h opt_compat.h opt_hwpmc_hooks.h .if ${MACHINE_CPUARCH} == "i386" SRCS+= opt_apic.h .endif OBJS= ${VDSO}.so .if ${MACHINE_CPUARCH} == "i386" -SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c opt_cpu.h +SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c \ + linux_emul.c opt_cpu.h .endif .if ${MACHINE_CPUARCH} == "i386" EXPORT_SYMS= EXPORT_SYMS+= linux_emul_path EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease EXPORT_SYMS+= linux_ioctl_register_handler EXPORT_SYMS+= linux_ioctl_unregister_handler .endif CLEANFILES= linux${SFX}_assym.h linux${SFX}_genassym.o linux${SFX}_locore.o linux${SFX}_assym.h: linux${SFX}_genassym.o sh ${SYSDIR}/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET} linux${SFX}_locore.o: linux${SFX}_assym.h assym.s ${CC} -x assembler-with-cpp -DLOCORE -m32 -shared -s \ -pipe -I. -I${SYSDIR} -Werror -Wall -fno-common -nostdinc -nostdlib \ -fno-omit-frame-pointer \ -Wl,-T${.CURDIR}/../../${MACHINE_CPUARCH}/linux${SFX}/${VDSO}.lds.s \ -Wl,-soname=${VDSO}.so.1,--eh-frame-hdr,-fPIC,-warn-common \ ${.IMPSRC} -o ${.TARGET} linux${SFX}_support.o: linux${SFX}_assym.h assym.s ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} .if ${MACHINE_CPUARCH} == "amd64" ${VDSO}.so: linux${SFX}_locore.o ${OBJCOPY} --input binary --output elf64-x86-64-freebsd \ --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} .else ${VDSO}.so: linux${SFX}_locore.o ${OBJCOPY} --input binary --output elf32-i386-freebsd \ --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} .endif linux${SFX}_genassym.o: ${CC} -c ${CFLAGS:N-fno-common} ${.IMPSRC} .if !defined(KERNBUILDDIR) .if defined(KTR) CFLAGS+= -DKTR .endif .if defined(DEBUG) CFLAGS+= -DDEBUG .endif .endif .include Index: head/sys/modules/linux_common/Makefile =================================================================== --- head/sys/modules/linux_common/Makefile (revision 283421) +++ head/sys/modules/linux_common/Makefile (revision 283422) @@ -1,25 +1,25 @@ # $FreeBSD$ .PATH: ${.CURDIR}/../../compat/linux KMOD= linux_common -SRCS= linux_common.c linux_mib.c linux_util.c \ +SRCS= linux_common.c linux_mib.c linux_util.c linux_emul.c \ opt_compat.h device_if.h vnode_if.h bus_if.h EXPORT_SYMS= EXPORT_SYMS+= linux_emul_path EXPORT_SYMS+= linux_ioctl_register_handler EXPORT_SYMS+= linux_ioctl_unregister_handler EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease .if !defined(KERNBUILDDIR) .if defined(DEBUG) CFLAGS+=-DDEBUG .endif .if defined(KTR) CFLAGS+=-DKTR .endif .endif .include