Index: head/sys/amd64/linux32/linux32_machdep.c =================================================================== --- head/sys/amd64/linux32/linux32_machdep.c (revision 283382) +++ head/sys/amd64/linux32/linux32_machdep.c (revision 283383) @@ -1,1061 +1,1060 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct l_old_select_argv { l_int nfds; l_uintptr_t readfds; l_uintptr_t writefds; l_uintptr_t exceptfds; l_uintptr_t timeout; } __packed; int linux_to_bsd_sigaltstack(int lsa) { int bsa = 0; if (lsa & LINUX_SS_DISABLE) bsa |= SS_DISABLE; if (lsa & LINUX_SS_ONSTACK) bsa |= SS_ONSTACK; return (bsa); } static int linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, l_int flags, l_int fd, l_loff_t pos); int bsd_to_linux_sigaltstack(int bsa) { int lsa = 0; if (bsa & SS_DISABLE) lsa |= LINUX_SS_DISABLE; if (bsa & SS_ONSTACK) lsa |= LINUX_SS_ONSTACK; return (lsa); } static void bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru) { lru->ru_utime.tv_sec = ru->ru_utime.tv_sec; lru->ru_utime.tv_usec = ru->ru_utime.tv_usec; lru->ru_stime.tv_sec = ru->ru_stime.tv_sec; lru->ru_stime.tv_usec = ru->ru_stime.tv_usec; lru->ru_maxrss = ru->ru_maxrss; lru->ru_ixrss = ru->ru_ixrss; lru->ru_idrss = ru->ru_idrss; lru->ru_isrss = ru->ru_isrss; lru->ru_minflt = ru->ru_minflt; lru->ru_majflt = ru->ru_majflt; lru->ru_nswap = ru->ru_nswap; lru->ru_inblock = ru->ru_inblock; lru->ru_oublock = ru->ru_oublock; lru->ru_msgsnd = ru->ru_msgsnd; lru->ru_msgrcv = ru->ru_msgrcv; lru->ru_nsignals = ru->ru_nsignals; lru->ru_nvcsw = ru->ru_nvcsw; lru->ru_nivcsw = ru->ru_nivcsw; } int linux_execve(struct thread *td, struct linux_execve_args *args) { struct image_args eargs; struct vmspace *oldvmspace; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(execve)) printf(ARGS(execve, "%s"), path); #endif error = pre_execve(td, &oldvmspace); if (error != 0) { free(path, M_TEMP); return (error); } error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, args->envp); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); - if (error == 0) { - /* Linux process can execute FreeBSD one, do not attempt - * to create emuldata for such process using - * linux_proc_init, this leads to a panic on KASSERT - * because such process has p->p_emuldata == NULL. 
- */ - if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX) - error = linux_proc_init(td, 0, 0); - } + if (error == 0) + error = linux_common_execve(td, &eargs); post_execve(td, error, oldvmspace); return (error); } CTASSERT(sizeof(struct l_iovec32) == 8); static int linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) { struct l_iovec32 iov32; struct iovec *iov; struct uio *uio; uint32_t iovlen; int error, i; *uiop = NULL; if (iovcnt > UIO_MAXIOV) return (EINVAL); iovlen = iovcnt * sizeof(struct iovec); uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); iov = (struct iovec *)(uio + 1); for (i = 0; i < iovcnt; i++) { error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); if (error) { free(uio, M_IOV); return (error); } iov[i].iov_base = PTRIN(iov32.iov_base); iov[i].iov_len = iov32.iov_len; } uio->uio_iov = iov; uio->uio_iovcnt = iovcnt; uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - uio->uio_resid) { free(uio, M_IOV); return (EINVAL); } uio->uio_resid += iov->iov_len; iov++; } *uiop = uio; return (0); } int linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, int error) { struct l_iovec32 iov32; struct iovec *iov; uint32_t iovlen; int i; *iovp = NULL; if (iovcnt > UIO_MAXIOV) return (error); iovlen = iovcnt * sizeof(struct iovec); iov = malloc(iovlen, M_IOV, M_WAITOK); for (i = 0; i < iovcnt; i++) { error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); if (error) { free(iov, M_IOV); return (error); } iov[i].iov_base = PTRIN(iov32.iov_base); iov[i].iov_len = iov32.iov_len; } *iovp = iov; return(0); } int linux_readv(struct thread *td, struct linux_readv_args *uap) { struct uio *auio; int error; error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int linux_writev(struct thread *td, struct linux_writev_args *uap) { struct uio *auio; int error; error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } struct l_ipc_kludge { l_uintptr_t msgp; l_long msgtyp; } __packed; int linux_ipc(struct thread *td, struct linux_ipc_args *args) { switch (args->what & 0xFFFF) { case LINUX_SEMOP: { struct linux_semop_args a; a.semid = args->arg1; a.tsops = args->ptr; a.nsops = args->arg2; return (linux_semop(td, &a)); } case LINUX_SEMGET: { struct linux_semget_args a; a.key = args->arg1; a.nsems = args->arg2; a.semflg = args->arg3; return (linux_semget(td, &a)); } case LINUX_SEMCTL: { struct linux_semctl_args a; int error; a.semid = args->arg1; a.semnum = args->arg2; a.cmd = args->arg3; error = copyin(args->ptr, &a.arg, sizeof(a.arg)); if (error) return (error); return (linux_semctl(td, &a)); } case LINUX_MSGSND: { struct linux_msgsnd_args a; a.msqid = args->arg1; a.msgp = args->ptr; a.msgsz = args->arg2; a.msgflg = args->arg3; return (linux_msgsnd(td, &a)); } case LINUX_MSGRCV: { struct linux_msgrcv_args a; a.msqid = args->arg1; a.msgsz = args->arg2; a.msgflg = args->arg3; if ((args->what >> 16) == 0) { struct l_ipc_kludge tmp; int error; if (args->ptr == 0) return (EINVAL); error = copyin(args->ptr, &tmp, sizeof(tmp)); if (error) return (error); a.msgp = PTRIN(tmp.msgp); a.msgtyp = tmp.msgtyp; } else { a.msgp = args->ptr; a.msgtyp = args->arg5; } return (linux_msgrcv(td, &a)); } case LINUX_MSGGET: { struct linux_msgget_args a; a.key = args->arg1; 
a.msgflg = args->arg2; return (linux_msgget(td, &a)); } case LINUX_MSGCTL: { struct linux_msgctl_args a; a.msqid = args->arg1; a.cmd = args->arg2; a.buf = args->ptr; return (linux_msgctl(td, &a)); } case LINUX_SHMAT: { struct linux_shmat_args a; a.shmid = args->arg1; a.shmaddr = args->ptr; a.shmflg = args->arg2; a.raddr = PTRIN((l_uint)args->arg3); return (linux_shmat(td, &a)); } case LINUX_SHMDT: { struct linux_shmdt_args a; a.shmaddr = args->ptr; return (linux_shmdt(td, &a)); } case LINUX_SHMGET: { struct linux_shmget_args a; a.key = args->arg1; a.size = args->arg2; a.shmflg = args->arg3; return (linux_shmget(td, &a)); } case LINUX_SHMCTL: { struct linux_shmctl_args a; a.shmid = args->arg1; a.cmd = args->arg2; a.buf = args->ptr; return (linux_shmctl(td, &a)); } default: break; } return (EINVAL); } int linux_old_select(struct thread *td, struct linux_old_select_args *args) { struct l_old_select_argv linux_args; struct linux_select_args newsel; int error; #ifdef DEBUG if (ldebug(old_select)) printf(ARGS(old_select, "%p"), args->ptr); #endif error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); newsel.nfds = linux_args.nfds; newsel.readfds = PTRIN(linux_args.readfds); newsel.writefds = PTRIN(linux_args.writefds); newsel.exceptfds = PTRIN(linux_args.exceptfds); newsel.timeout = PTRIN(linux_args.timeout); return (linux_select(td, &newsel)); } int linux_set_cloned_tls(struct thread *td, void *desc) { struct user_segment_descriptor sd; struct l_user_desc info; struct pcb *pcb; int error; int a[2]; error = copyin(desc, &info, sizeof(struct l_user_desc)); if (error) { printf(LMSG("copyin failed!")); } else { /* We might copy out the entry_number as GUGS32_SEL. */ info.entry_number = GUGS32_SEL; error = copyout(&info, desc, sizeof(struct l_user_desc)); if (error) printf(LMSG("copyout failed!")); a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); memcpy(&sd, &a, sizeof(a)); #ifdef DEBUG if (ldebug(clone)) printf("Segment created in clone with " "CLONE_SETTLS: lobase: %x, hibase: %x, " "lolimit: %x, hilimit: %x, type: %i, " "dpl: %i, p: %i, xx: %i, long: %i, " "def32: %i, gran: %i\n", sd.sd_lobase, sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, sd.sd_long, sd.sd_def32, sd.sd_gran); #endif pcb = td->td_pcb; pcb->pcb_gsbase = (register_t)info.base_addr; /* XXXKIB pcb->pcb_gs32sd = sd; */ td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); set_pcb_flags(pcb, PCB_32BIT); } return (error); } int linux_set_upcall_kse(struct thread *td, register_t stack) { - td->td_frame->tf_rsp = stack; + if (stack) + td->td_frame->tf_rsp = stack; + /* + * The newly created Linux thread returns + * to the user space by the same path that a parent do. 
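Seen from the Linux guest, forcing tf_rax to 0 just below reproduces the standard clone(2) return convention: the parent gets the new task's TID back from the syscall, while the child starts executing with a return value of 0. A minimal guest-side sketch of that convention (plain 32-bit Linux user code using the glibc clone() wrapper; not part of this change):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int
child_fn(void *arg)
{
	/* The new task begins here; at the raw syscall level it saw %eax == 0. */
	printf("child running\n");
	return (0);
}

int
main(void)
{
	char *stack = malloc(65536);
	int tid;

	/* Parent: clone() returns the child's TID (or -1 on error). */
	tid = clone(child_fn, stack + 65536, SIGCHLD, NULL);
	printf("parent: child tid %d\n", tid);
	waitpid(tid, NULL, 0);
	free(stack);
	return (0);
}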
+ */ + td->td_frame->tf_rax = 0; return (0); } #define STACK_SIZE (2 * 1024 * 1024) #define GUARD_SIZE (4 * PAGE_SIZE) int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { #ifdef DEBUG if (ldebug(mmap2)) printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"), args->addr, args->len, args->prot, args->flags, args->fd, args->pgoff); #endif return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * PAGE_SIZE)); } int linux_mmap(struct thread *td, struct linux_mmap_args *args) { int error; struct l_mmap_argv linux_args; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); #ifdef DEBUG if (ldebug(mmap)) printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"), linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pgoff); #endif return (linux_mmap_common(td, linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, (uint32_t)linux_args.pgoff)); } static int linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, l_int flags, l_int fd, l_loff_t pos) { struct proc *p = td->td_proc; struct mmap_args /* { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; } */ bsd_args; int error; struct file *fp; cap_rights_t rights; error = 0; bsd_args.flags = 0; fp = NULL; /* * Linux mmap(2): * You must specify exactly one of MAP_SHARED and MAP_PRIVATE */ if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) return (EINVAL); if (flags & LINUX_MAP_SHARED) bsd_args.flags |= MAP_SHARED; if (flags & LINUX_MAP_PRIVATE) bsd_args.flags |= MAP_PRIVATE; if (flags & LINUX_MAP_FIXED) bsd_args.flags |= MAP_FIXED; if (flags & LINUX_MAP_ANON) { /* Enforce pos to be on page boundary, then ignore. */ if ((pos & PAGE_MASK) != 0) return (EINVAL); pos = 0; bsd_args.flags |= MAP_ANON; } else bsd_args.flags |= MAP_NOSYNC; if (flags & LINUX_MAP_GROWSDOWN) bsd_args.flags |= MAP_STACK; /* * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC * on Linux/i386. We do this to ensure maximum compatibility. * Linux/ia64 does the same in i386 emulation mode. */ bsd_args.prot = prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; if (bsd_args.fd != -1) { /* * Linux follows Solaris mmap(2) description: * The file descriptor fildes is opened with * read permission, regardless of the * protection options specified. */ error = fget(td, bsd_args.fd, cap_rights_init(&rights, CAP_MMAP), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EINVAL); } /* Linux mmap() just fails for O_WRONLY files */ if (!(fp->f_flag & FREAD)) { fdrop(fp, td); return (EACCES); } fdrop(fp, td); } if (flags & LINUX_MAP_GROWSDOWN) { /* * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option * takes as addr the inital BOS, and as len, the initial * region size. It can then grow down from addr without * limit. However, Linux threads has an implicit internal * limit to stack size of STACK_SIZE. Its just not * enforced explicitly in Linux. But, here we impose * a limit of (STACK_SIZE - GUARD_SIZE) on the stack * region, since we can do this with our mmap. 
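To make the sizing described here concrete, a worked example assuming 4 KiB pages (so GUARD_SIZE is 16 KiB); the actual address arithmetic appears a little further down in this function:

	A thread-stack request of len = 128 KiB at address A becomes

		bsd_args.addr = A - (STACK_SIZE - GUARD_SIZE - len)
		              = A - (2048 - 16 - 128) KiB = A - 1904 KiB
		bsd_args.len  = STACK_SIZE - GUARD_SIZE = 2032 KiB

	so the mapping still ends at A + len, the top of stack the Linux
	program asked for, and can auto-grow down by at most 2032 KiB.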
* * Our mmap with MAP_STACK takes addr as the maximum * downsize limit on BOS, and as len the max size of * the region. It then maps the top SGROWSIZ bytes, * and auto grows the region down, up to the limit * in addr. * * If we don't use the MAP_STACK option, the effect * of this code is to allocate a stack region of a * fixed size of (STACK_SIZE - GUARD_SIZE). */ if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { /* * Some Linux apps will attempt to mmap * thread stacks near the top of their * address space. If their TOS is greater * than vm_maxsaddr, vm_map_growstack() * will confuse the thread stack with the * process stack and deliver a SEGV if they * attempt to grow the thread stack past their * current stacksize rlimit. To avoid this, * adjust vm_maxsaddr upwards to reflect * the current stacksize rlimit rather * than the maximum possible stacksize. * It would be better to adjust the * mmap'ed region, but some apps do not check * mmap's return value. */ PROC_LOCK(p); p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK - lim_cur(p, RLIMIT_STACK); PROC_UNLOCK(p); } /* * This gives us our maximum stack size and a new BOS. * If we're using VM_STACK, then mmap will just map * the top SGROWSIZ bytes, and let the stack grow down * to the limit at BOS. If we're not using VM_STACK * we map the full stack, since we don't have a way * to autogrow it. */ if (len > STACK_SIZE - GUARD_SIZE) { bsd_args.addr = (caddr_t)PTRIN(addr); bsd_args.len = len; } else { bsd_args.addr = (caddr_t)PTRIN(addr) - (STACK_SIZE - GUARD_SIZE - len); bsd_args.len = STACK_SIZE - GUARD_SIZE; } } else { bsd_args.addr = (caddr_t)PTRIN(addr); bsd_args.len = len; } bsd_args.pos = pos; #ifdef DEBUG if (ldebug(mmap)) printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", __func__, (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); #endif error = sys_mmap(td, &bsd_args); #ifdef DEBUG if (ldebug(mmap)) printf("-> %s() return: 0x%x (0x%08x)\n", __func__, error, (u_int)td->td_retval[0]); #endif return (error); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { struct mprotect_args bsd_args; bsd_args.addr = uap->addr; bsd_args.len = uap->len; bsd_args.prot = uap->prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; return (sys_mprotect(td, &bsd_args)); } int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; if (args->level < 0 || args->level > 3) return (EINVAL); if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | (args->level * (PSL_IOPL / 3)); return (0); } int linux_sigaction(struct thread *td, struct linux_sigaction_args *args) { l_osigaction_t osa; l_sigaction_t act, oact; int error; #ifdef DEBUG if (ldebug(sigaction)) printf(ARGS(sigaction, "%d, %p, %p"), args->sig, (void *)args->nsa, (void *)args->osa); #endif if (args->nsa != NULL) { error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); if (error) return (error); act.lsa_handler = osa.lsa_handler; act.lsa_flags = osa.lsa_flags; act.lsa_restorer = osa.lsa_restorer; LINUX_SIGEMPTYSET(act.lsa_mask); act.lsa_mask.__bits[0] = osa.lsa_mask; } error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, args->osa ? 
&oact : NULL); if (args->osa != NULL && !error) { osa.lsa_handler = oact.lsa_handler; osa.lsa_flags = oact.lsa_flags; osa.lsa_restorer = oact.lsa_restorer; osa.lsa_mask = oact.lsa_mask.__bits[0]; error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); } return (error); } /* * Linux has two extra args, restart and oldmask. We don't use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. */ int linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) { sigset_t sigmask; l_sigset_t mask; #ifdef DEBUG if (ldebug(sigsuspend)) printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); #endif LINUX_SIGEMPTYSET(mask); mask.__bits[0] = args->mask; linux_to_bsd_sigset(&mask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) { l_sigset_t lmask; sigset_t sigmask; int error; #ifdef DEBUG if (ldebug(rt_sigsuspend)) printf(ARGS(rt_sigsuspend, "%p, %d"), (void *)uap->newset, uap->sigsetsize); #endif if (uap->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); if (error) return (error); linux_to_bsd_sigset(&lmask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_pause(struct thread *td, struct linux_pause_args *args) { struct proc *p = td->td_proc; sigset_t sigmask; #ifdef DEBUG if (ldebug(pause)) printf(ARGS(pause, "")); #endif PROC_LOCK(p); sigmask = td->td_sigmask; PROC_UNLOCK(p); return (kern_sigsuspend(td, sigmask)); } int linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) { stack_t ss, oss; l_stack_t lss; int error; #ifdef DEBUG if (ldebug(sigaltstack)) printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); #endif if (uap->uss != NULL) { error = copyin(uap->uss, &lss, sizeof(l_stack_t)); if (error) return (error); ss.ss_sp = PTRIN(lss.ss_sp); ss.ss_size = lss.ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); } error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, (uap->uoss != NULL) ? 
&oss : NULL); if (!error && uap->uoss != NULL) { lss.ss_sp = PTROUT(oss.ss_sp); lss.ss_size = oss.ss_size; lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); } return (error); } int linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) { struct ftruncate_args sa; #ifdef DEBUG if (ldebug(ftruncate64)) printf(ARGS(ftruncate64, "%u, %jd"), args->fd, (intmax_t)args->length); #endif sa.fd = args->fd; sa.length = args->length; return sys_ftruncate(td, &sa); } int linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) { struct timeval atv; l_timeval atv32; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); atv32.tv_sec = atv.tv_sec; atv32.tv_usec = atv.tv_usec; error = copyout(&atv32, uap->tp, sizeof(atv32)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = tz_minuteswest; rtz.tz_dsttime = tz_dsttime; error = copyout(&rtz, uap->tzp, sizeof(rtz)); } return (error); } int linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) { l_timeval atv32; struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tp) { error = copyin(uap->tp, &atv32, sizeof(atv32)); if (error) return (error); atv.tv_sec = atv32.tv_sec; atv.tv_usec = atv32.tv_usec; tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) { struct l_rusage s32; struct rusage s; int error; error = kern_getrusage(td, uap->who, &s); if (error != 0) return (error); if (uap->rusage != NULL) { bsd_to_linux_rusage(&s, &s32); error = copyout(&s32, uap->rusage, sizeof(s32)); } return (error); } int linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args) { struct l_user_desc info; struct user_segment_descriptor sd; struct pcb *pcb; int a[2]; int error; error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); #ifdef DEBUG if (ldebug(set_thread_area)) printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, " "%i, %i, %i"), info.entry_number, info.base_addr, info.limit, info.seg_32bit, info.contents, info.read_exec_only, info.limit_in_pages, info.seg_not_present, info.useable); #endif /* * Semantics of Linux version: every thread in the system has array * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. * This syscall loads one of the selected TLS decriptors with a value * and also loads GDT descriptors 6, 7 and 8 with the content of * the per-thread descriptors. * * Semantics of FreeBSD version: I think we can ignore that Linux has * three per-thread descriptors and use just the first one. * The tls_array[] is used only in [gs]et_thread_area() syscalls and * for loading the GDT descriptors. We use just one GDT descriptor * for TLS, so we will load just one. * * XXX: This doesn't work when a user space process tries to use more * than one TLS segment. Comment in the Linux source says wine might * do this. */ /* * GLIBC reads current %gs and call set_thread_area() with it. * We should let GUDATA_SEL and GUGS32_SEL proceed as well because * we use these segments. */ switch (info.entry_number) { case GUGS32_SEL: case GUDATA_SEL: case 6: case -1: info.entry_number = GUGS32_SEL; break; default: return (EINVAL); } /* * We have to copy out the GDT entry we use. 
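What a guest does with the copied-out descriptor, roughly: it reads back the slot the kernel chose and loads the matching ring-3 selector into %gs. A hedged sketch of glibc-style i386 TLS setup in Linux user code (the tls_block pointer and function name are made up for illustration):

#include <asm/ldt.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int
setup_gs_tls(void *tls_block)
{
	struct user_desc desc;
	unsigned short sel;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;			/* let the kernel pick a slot */
	desc.base_addr = (unsigned long)tls_block;
	desc.limit = 0xfffff;
	desc.seg_32bit = 1;
	desc.limit_in_pages = 1;
	desc.useable = 1;

	if (syscall(SYS_set_thread_area, &desc) != 0)
		return (-1);

	/* The kernel wrote back the slot it chose; build a ring-3 selector. */
	sel = desc.entry_number * 8 + 3;
	__asm__ volatile ("movw %0, %%gs" : : "r" (sel));
	return (0);
}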
* * XXX: What if a user space program does not check the return value * and tries to use 6, 7 or 8? */ error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (error); if (LINUX_LDT_empty(&info)) { a[0] = 0; a[1] = 0; } else { a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); } memcpy(&sd, &a, sizeof(a)); #ifdef DEBUG if (ldebug(set_thread_area)) printf("Segment created in set_thread_area: " "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, " "type: %i, dpl: %i, p: %i, xx: %i, long: %i, " "def32: %i, gran: %i\n", sd.sd_lobase, sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, sd.sd_long, sd.sd_def32, sd.sd_gran); #endif pcb = td->td_pcb; pcb->pcb_gsbase = (register_t)info.base_addr; set_pcb_flags(pcb, PCB_32BIT); update_gdt_gsbase(td, info.base_addr); return (0); } int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options; struct rusage ru, *rup; struct l_rusage lru; #ifdef DEBUG if (ldebug(wait4)) printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; if (args->rusage != NULL) rup = &ru; else rup = NULL; error = linux_common_wait(td, args->pid, args->status, options, rup); if (error) return (error); if (args->rusage != NULL) { bsd_to_linux_rusage(rup, &lru); error = copyout(&lru, args->rusage, sizeof(lru)); } return (error); } Index: head/sys/amd64/linux32/linux32_sysvec.c =================================================================== --- head/sys/amd64/linux32/linux32_sysvec.c (revision 283382) +++ head/sys/amd64/linux32/linux32_sysvec.c (revision 283383) @@ -1,1184 +1,1185 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #define AUXARGS_ENTRY_32(pos, id, val) \ do { \ suword32(pos++, id); \ suword32(pos++, val); \ } while (0) #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 const char *linux_platform = "i686"; static int linux_szplatform; extern char linux_sigcode[]; extern int linux_szsigcode; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); SET_DECLARE(linux_device_handler_set, struct linux_device_handler); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static register_t *linux_copyout_strings(struct image_params *imgp); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static void linux32_fixlimit(struct rlimit *rl, int which); static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static eventhandler_tag linux_exit_tag; static eventhandler_tag linux_exec_tag; +static eventhandler_tag linux_thread_dtor_tag; /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 0, LINUX_SIGUSR1, LINUX_SIGUSR2 }; int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGTRAP, SIGABRT, SIGBUS, SIGFPE, SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, SIGPIPE, SIGALRM, SIGTERM, SIGBUS, SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, SIGIO, SIGURG, SIGSYS }; #define 
LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); base = (Elf32_Addr *)*stack_base; args = (Elf32_Auxargs *)imgp->auxargs; pos = base + (imgp->args->argc + imgp->args->envc + 2); AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent); AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY_32(pos, AT_BASE, args->base); AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0); AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform)); if (args->execfd != -1) AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY_32(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword32(base, (uint32_t)imgp->args->argc); *stack_base = (register_t *)base; return 0; } extern unsigned long linux_sznonrtsigcode; static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. 
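The three values stored into the frame below (sf_sig, sf_siginfo, sf_ucontext) become the arguments of an SA_SIGINFO handler on the guest side, per standard sigaction(2) semantics. A small Linux user-space example of the kind of handler this frame targets (illustrative only):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void
handler(int sig, siginfo_t *info, void *ucontext)
{
	/* Entered through the signal trampoline with the frame built by rt_sendsig. */
	printf("caught signal %d (si_code %d)\n", sig, info->si_code);
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return (0);
}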
*/ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack, i; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. 
*/ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags, i; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. 
Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
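A concrete instance of these checks, using the i386 flag values (shown only to make the macros tangible):

	old user %eflags   = 0x00000202   (IF)
	forged sc_eflags   = 0x00003202   (IF | IOPL=3)
	xor                = 0x00003000   = PSL_IOPL, not in PSL_USERCHANGE

so EFLAGS_SECURE() fails and the syscall returns EINVAL; an attempt to gain I/O privilege through a forged signal context is rejected. CS_SECURE() below only accepts a %cs whose low two bits (the requested privilege level, ISPL(cs)) equal SEL_UPL (3), so a context naming a ring-0 code selector earns the process a SIGBUS instead of a privilege escalation.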
*/ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; p = td->td_proc; frame = td->td_frame; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an * alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * Clear registers on exec * XXX copied from ia32_signal.c. 
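The register mapping in linux32_fetch_syscall_args() above is the Linux i386 int $0x80 convention: %eax holds the syscall number, %ebx, %ecx, %edx, %esi, %edi and %ebp carry arguments one through six, and the result (or a negative errno) comes back in %eax. A hedged guest-side sketch of write(1, buf, len) issued that way (32-bit Linux user code):

static long
raw_write(int fd, const void *buf, unsigned long len)
{
	long ret;

	__asm__ volatile (
	    "int $0x80"
	    : "=a" (ret)		/* result or -errno in %eax */
	    : "a" (4),			/* __NR_write on i386 */
	      "b" (fd),			/* arg 1 -> %ebx */
	      "c" (buf),		/* arg 2 -> %ecx */
	      "d" (len)			/* arg 3 -> %edx */
	    : "memory");
	return (ret);
}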
*/ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = imgp->ps_strings; fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); td->td_retval[1] = 0; } /* * XXX copied from ia32_sysvec.c. */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; u_int32_t *vectp; char *stringp, *destp; u_int32_t *stack_base; struct linux32_ps_strings *arginfo; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * Install LINUX_PLATFORM */ copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform), linux_szplatform); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(u_int32_t)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (u_int32_t *)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t)); /* * vectp also becomes our initial stack base */ stack_base = vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword32(vectp++, 0); suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. 
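Roughly, the image this function leaves on the new 32-bit stack (together with elf_linux_fixup() above, which stores argc and the AT_* entries) looks like this, from the returned stack base upward toward LINUX32_PS_STRINGS:

	argc
	argv[0] ... argv[argc-1], NULL
	envp[0] ... envp[envc-1], NULL
	ELF auxiliary vector (AT_* id/value pairs), terminated by AT_NULL
	argument and environment strings
	linux_platform string ("i686")
	linux32_ps_strings structure at LINUX32_PS_STRINGS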
*/ for (; envc > 0; --envc) { suword32(vectp++, (uint32_t)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword32(vectp, 0); return ((register_t *)stack_base); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_sigsize = LINUX_SIGTBLSZ, .sv_sigtbl = bsd_to_linux_signal, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = linux_sigcode, .sv_szsigcode = &linux_szsigcode, .sv_prepsyscall = NULL, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, + .sv_thread_detach = linux_thread_detach, }; INIT_SYSENTVEC(elf_sysvec, &elf_linux_sysvec); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. 
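For example, a binary branded with a GNU ABI note for Linux 2.6.16 carries desc[] = { 0, 2, 6, 16 }, which the formula below turns into

	osrel = 2 * 1000000 + 6 * 1000 + 16 = 2006016

the same encoding used by constants such as LINUX_KERNVER_2004000 (2.4.0) that elf_linux_fixup() compares against when deciding whether to export AT_CLKTCK.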
*/ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; struct linux_device_handler **ldhp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); - mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); - sx_init(&emul_shared_lock, "emuldata->shared lock"); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit, NULL, 1000); linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec, NULL, 1000); + linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, + linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); linux_szplatform = roundup(strlen(linux_platform) + 1, sizeof(char *)); linux_osd_jail_register(); stclohz = (stathz ? 
stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_unregister_handler(*ldhp); - mtx_destroy(&emul_lock); - sx_destroy(&emul_shared_lock); mtx_destroy(&futex_mtx); EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); + EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return EOPNOTSUPP; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/compat/linux/check_error.d =================================================================== --- head/sys/compat/linux/check_error.d (revision 283382) +++ head/sys/compat/linux/check_error.d (revision 283383) @@ -1,144 +1,144 @@ #!/usr/sbin/dtrace -qs /*- * Copyright (c) 2008-2012 Alexander Leidinger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Report error conditions: * - emulation errors (unsupportet stuff, unknown stuff, ...) * - kernel errors (resource shortage, ...) 
* - programming errors (errors which can happen, but should not happen) */ linuxulator*:dummy::not_implemented, -linuxulator*:emul:proc_exit:child_clear_tid_error, -linuxulator*:emul:proc_exit:futex_failed, +linuxulator*:emul:linux_thread_detach:child_clear_tid_error, +linuxulator*:emul:linux_thread_detach:futex_failed, linuxulator*:emul:linux_schedtail:copyout_error, linuxulator*:futex:futex_get:error, linuxulator*:futex:futex_sleep:requeue_error, linuxulator*:futex:futex_sleep:sleep_error, linuxulator*:futex:futex_wait:copyin_error, linuxulator*:futex:futex_wait:itimerfix_error, linuxulator*:futex:futex_wait:sleep_error, linuxulator*:futex:futex_atomic_op:missing_access_check, linuxulator*:futex:futex_atomic_op:unimplemented_op, linuxulator*:futex:futex_atomic_op:unimplemented_cmp, linuxulator*:futex:linux_sys_futex:unimplemented_clockswitch, linuxulator*:futex:linux_sys_futex:copyin_error, linuxulator*:futex:linux_sys_futex:unhandled_efault, linuxulator*:futex:linux_sys_futex:unimplemented_lock_pi, linuxulator*:futex:linux_sys_futex:unimplemented_unlock_pi, linuxulator*:futex:linux_sys_futex:unimplemented_trylock_pi, linuxulator*:futex:linux_sys_futex:unimplemented_wait_requeue_pi, linuxulator*:futex:linux_sys_futex:unimplemented_cmp_requeue_pi, linuxulator*:futex:linux_sys_futex:unknown_operation, linuxulator*:futex:linux_get_robust_list:copyout_error, linuxulator*:futex:handle_futex_death:copyin_error, linuxulator*:futex:fetch_robust_entry:copyin_error, linuxulator*:futex:release_futexes:copyin_error, linuxulator*:time:linux_clock_gettime:conversion_error, linuxulator*:time:linux_clock_gettime:gettime_error, linuxulator*:time:linux_clock_gettime:copyout_error, linuxulator*:time:linux_clock_settime:conversion_error, linuxulator*:time:linux_clock_settime:settime_error, linuxulator*:time:linux_clock_settime:copyin_error, linuxulator*:time:linux_clock_getres:conversion_error, linuxulator*:time:linux_clock_getres:getres_error, linuxulator*:time:linux_clock_getres:copyout_error, linuxulator*:time:linux_nanosleep:conversion_error, linuxulator*:time:linux_nanosleep:nanosleep_error, linuxulator*:time:linux_nanosleep:copyout_error, linuxulator*:time:linux_nanosleep:copyin_error, linuxulator*:time:linux_clock_nanosleep:copyin_error, linuxulator*:time:linux_clock_nanosleep:conversion_error, linuxulator*:time:linux_clock_nanosleep:copyout_error, linuxulator*:time:linux_clock_nanosleep:nanosleep_error, linuxulator*:sysctl:handle_string:copyout_error, linuxulator*:sysctl:linux_sysctl:copyin_error, linuxulator*:mib:linux_sysctl_osname:sysctl_string_error, linuxulator*:mib:linux_sysctl_osrelease:sysctl_string_error, linuxulator*:mib:linux_sysctl_oss_version:sysctl_string_error, linuxulator*:mib:linux_prison_create:vfs_copyopt_error, linuxulator*:mib:linux_prison_check:vfs_copyopt_error, linuxulator*:mib:linux_prison_check:vfs_getopt_error, linuxulator*:mib:linux_prison_set:vfs_copyopt_error, linuxulator*:mib:linux_prison_set:vfs_getopt_error, linuxulator*:mib:linux_prison_get:vfs_setopt_error, linuxulator*:mib:linux_prison_get:vfs_setopts_error { printf("ERROR: %s in %s:%s:%s\n", probename, probeprov, probemod, probefunc); stack(); ustack(); } linuxulator*:util:linux_driver_get_name_dev:nullcall, linuxulator*:util:linux_driver_get_major_minor:nullcall, linuxulator*:futex:linux_sys_futex:invalid_cmp_requeue_use, linuxulator*:futex:linux_sys_futex:deprecated_requeue, linuxulator*:futex:linux_set_robust_list:size_error, linuxulator*:time:linux_clock_getres:nullcall { printf("WARNING: %s:%s:%s:%s in 
application %s, maybe an application error?\n", probename, probeprov, probemod, probefunc, execname); stack(); ustack(); } linuxulator*:util:linux_driver_get_major_minor:notfound { printf("WARNING: Application %s failed to find %s in %s:%s:%s, this may or may not be a problem.\n", execname, stringof(args[0]), probename, probeprov, probemod); stack(); ustack(); } linuxulator*:time:linux_to_native_clockid:unknown_clockid { printf("INFO: Application %s tried to use unknown clockid %d. Please report this to freebsd-emulation@FreeBSD.org.\n", execname, arg0); } linuxulator*:time:linux_to_native_clockid:unsupported_clockid, linuxulator*:time:linux_clock_nanosleep:unsupported_clockid { printf("WARNING: Application %s tried to use unsupported clockid (%d), this may or may not be a problem for the application.\nPatches to support this clockid are welcome on the freebsd-emulation@FreeBSD.org mailinglist.\n", execname, arg0); } linuxulator*:time:linux_clock_nanosleep:unsupported_flags { printf("WARNING: Application %s tried to use unsupported flags (%d), this may or may not be a problem for the application.\nPatches to support those flags are welcome on the freebsd-emulation@FreeBSD.org mailinglist.\n", execname, arg0); } linuxulator*:sysctl:linux_sysctl:wrong_length { printf("ERROR: Application %s issued a sysctl which failed the length restrictions.\nThe length passed is %d, the min length supported is 1 and the max length supported is %d.\n", execname, arg0, arg1); stack(); ustack(); } linuxulator*:sysctl:linux_sysctl:unsupported_sysctl { printf("ERROR: Application %s issued an unsupported sysctl (%s).\nPatches to support this sysctl are welcome on the freebsd-emulation@FreeBSD.org mailinglist.\n", execname, stringof(args[0])); } Index: head/sys/compat/linux/check_internal_locks.d =================================================================== --- head/sys/compat/linux/check_internal_locks.d (revision 283382) +++ head/sys/compat/linux/check_internal_locks.d (revision 283383) @@ -1,132 +1,97 @@ #!/usr/sbin/dtrace -qs /*- * Copyright (c) 2008-2012 Alexander Leidinger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ /** * Check if the internal locks are correctly acquired/released: * - no recursive locking (mtx locks, write locks) * - no unlocking of already unlocked one * * Print stacktrace if a lock is longer locked than about 10sec or more. */ #pragma D option dynvarsize=32m #pragma D option specsize=32m BEGIN { - check["emul_lock"] = 0; - check["emul_shared_rlock"] = 0; - check["emul_shared_wlock"] = 0; check["futex_mtx"] = 0; } -linuxulator*:locks:emul_lock:locked, -linuxulator*:locks:emul_shared_wlock:locked, linuxulator*:locks:futex_mtx:locked /check[probefunc] > 0/ { printf("ERROR: recursive lock of %s (%p),", probefunc, arg0); printf(" or missing SDT probe in kernel. Stack trace follows:"); stack(); } -linuxulator*:locks:emul_lock:locked, -linuxulator*:locks:emul_shared_rlock:locked, -linuxulator*:locks:emul_shared_wlock:locked, linuxulator*:locks:futex_mtx:locked { ++check[probefunc]; @stats[probefunc] = count(); ts[probefunc] = timestamp; spec[probefunc] = speculation(); } -linuxulator*:locks:emul_lock:unlock, -linuxulator*:locks:emul_shared_rlock:unlock, -linuxulator*:locks:emul_shared_wlock:unlock, linuxulator*:locks:futex_mtx:unlock /check[probefunc] == 0/ { printf("ERROR: unlock attemt of unlocked %s (%p),", probefunc, arg0); printf(" missing SDT probe in kernel, or dtrace program started"); printf(" while the %s was already held (race condition).", probefunc); printf(" Stack trace follows:"); stack(); } -linuxulator*:locks:emul_lock:unlock, -linuxulator*:locks:emul_shared_rlock:unlock, -linuxulator*:locks:emul_shared_wlock:unlock, linuxulator*:locks:futex_mtx:unlock { discard(spec[probefunc]); spec[probefunc] = 0; --check[probefunc]; } /* Timeout handling */ - -tick-10s -/spec["emul_lock"] != 0 && timestamp - ts["emul_lock"] >= 9999999000/ -{ - commit(spec["emul_lock"]); - spec["emul_lock"] = 0; -} - -tick-10s -/spec["emul_shared_wlock"] != 0 && timestamp - ts["emul_shared_wlock"] >= 9999999000/ -{ - commit(spec["emul_shared_wlock"]); - spec["emul_shared_wlock"] = 0; -} - -tick-10s -/spec["emul_shared_rlock"] != 0 && timestamp - ts["emul_shared_rlock"] >= 9999999000/ -{ - commit(spec["emul_shared_rlock"]); - spec["emul_shared_rlock"] = 0; -} tick-10s /spec["futex_mtx"] != 0 && timestamp - ts["futex_mtx"] >= 9999999000/ { commit(spec["futex_mtx"]); spec["futex_mtx"] = 0; } /* Statistics */ END { printf("Number of locks per type:"); printa(@stats); } Index: head/sys/compat/linux/linux_emul.c =================================================================== --- head/sys/compat/linux/linux_emul.c (revision 283382) +++ head/sys/compat/linux/linux_emul.c (revision 283383) @@ -1,473 +1,357 @@ /*- * Copyright (c) 2006 Roman Divacky + * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include +#include /** * Special DTrace provider for the linuxulator. * * In this file we define the provider for the entire linuxulator. All * modules (= files of the linuxulator) use it. * * We define a different name depending on the emulated bitsize, see * ../..//linux{,32}/linux.h, e.g.: * native bitsize = linuxulator * amd64, 32bit emulation = linuxulator32 */ LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE); /** - * Special DTrace module "locks", it covers some linuxulator internal - * locks. - */ -LIN_SDT_PROBE_DEFINE1(locks, emul_lock, locked, "struct mtx *"); -LIN_SDT_PROBE_DEFINE1(locks, emul_lock, unlock, "struct mtx *"); -LIN_SDT_PROBE_DEFINE1(locks, emul_shared_rlock, locked, "struct sx *"); -LIN_SDT_PROBE_DEFINE1(locks, emul_shared_rlock, unlock, "struct sx *"); -LIN_SDT_PROBE_DEFINE1(locks, emul_shared_wlock, locked, "struct sx *"); -LIN_SDT_PROBE_DEFINE1(locks, emul_shared_wlock, unlock, "struct sx *"); - -/** * DTrace probes in this module. 
*/ -LIN_SDT_PROBE_DEFINE2(emul, em_find, entry, "struct proc *", "int"); +LIN_SDT_PROBE_DEFINE1(emul, em_find, entry, "struct thread *"); LIN_SDT_PROBE_DEFINE0(emul, em_find, return); -LIN_SDT_PROBE_DEFINE3(emul, proc_init, entry, "struct thread *", "pid_t", - "int"); +LIN_SDT_PROBE_DEFINE3(emul, proc_init, entry, "struct thread *", + "struct thread *", "int"); LIN_SDT_PROBE_DEFINE0(emul, proc_init, create_thread); LIN_SDT_PROBE_DEFINE0(emul, proc_init, fork); LIN_SDT_PROBE_DEFINE0(emul, proc_init, exec); LIN_SDT_PROBE_DEFINE0(emul, proc_init, return); LIN_SDT_PROBE_DEFINE1(emul, proc_exit, entry, "struct proc *"); -LIN_SDT_PROBE_DEFINE0(emul, proc_exit, futex_failed); -LIN_SDT_PROBE_DEFINE3(emul, proc_exit, reparent, "pid_t", "pid_t", - "struct proc *"); -LIN_SDT_PROBE_DEFINE1(emul, proc_exit, child_clear_tid_error, "int"); -LIN_SDT_PROBE_DEFINE0(emul, proc_exit, return); +LIN_SDT_PROBE_DEFINE1(emul, linux_thread_detach, entry, "struct thread *"); +LIN_SDT_PROBE_DEFINE0(emul, linux_thread_detach, futex_failed); +LIN_SDT_PROBE_DEFINE1(emul, linux_thread_detach, child_clear_tid_error, "int"); +LIN_SDT_PROBE_DEFINE0(emul, linux_thread_detach, return); LIN_SDT_PROBE_DEFINE2(emul, proc_exec, entry, "struct proc *", "struct image_params *"); LIN_SDT_PROBE_DEFINE0(emul, proc_exec, return); LIN_SDT_PROBE_DEFINE0(emul, linux_schedtail, entry); LIN_SDT_PROBE_DEFINE1(emul, linux_schedtail, copyout_error, "int"); LIN_SDT_PROBE_DEFINE0(emul, linux_schedtail, return); LIN_SDT_PROBE_DEFINE1(emul, linux_set_tid_address, entry, "int *"); LIN_SDT_PROBE_DEFINE0(emul, linux_set_tid_address, return); -LIN_SDT_PROBE_DEFINE2(emul, linux_kill_threads, entry, "struct thread *", - "int"); -LIN_SDT_PROBE_DEFINE1(emul, linux_kill_threads, kill, "pid_t"); -LIN_SDT_PROBE_DEFINE0(emul, linux_kill_threads, return); -struct sx emul_shared_lock; -struct mtx emul_lock; - -/* this returns locked reference to the emuldata entry (if found) */ +/* + * This returns reference to the emuldata entry (if found) + * + * Hold PROC_LOCK when referencing emuldata from other threads. 
+ */ struct linux_emuldata * -em_find(struct proc *p, int locked) +em_find(struct thread *td) { struct linux_emuldata *em; - LIN_SDT_PROBE2(emul, em_find, entry, p, locked); + LIN_SDT_PROBE1(emul, em_find, entry, td); - if (locked == EMUL_DOLOCK) - EMUL_LOCK(&emul_lock); + em = td->td_emuldata; - em = p->p_emuldata; - - if (em == NULL && locked == EMUL_DOLOCK) - EMUL_UNLOCK(&emul_lock); - LIN_SDT_PROBE1(emul, em_find, return, em); + return (em); } -int -linux_proc_init(struct thread *td, pid_t child, int flags) +void +linux_proc_init(struct thread *td, struct thread *newtd, int flags) { - struct linux_emuldata *em, *p_em; - struct proc *p; + struct linux_emuldata *em; - LIN_SDT_PROBE3(emul, proc_init, entry, td, child, flags); + LIN_SDT_PROBE3(emul, proc_init, entry, td, newtd, flags); - if (child != 0) { - /* fork or create a thread */ - em = malloc(sizeof *em, M_LINUX, M_WAITOK | M_ZERO); - em->pid = child; + if (newtd != NULL) { + /* non-exec call */ + em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO); em->pdeath_signal = 0; em->flags = 0; em->robust_futexes = NULL; if (flags & LINUX_CLONE_THREAD) { - /* handled later in the code */ LIN_SDT_PROBE0(emul, proc_init, create_thread); - } else { - struct linux_emuldata_shared *s; + em->em_tid = newtd->td_tid; + } else { LIN_SDT_PROBE0(emul, proc_init, fork); - s = malloc(sizeof *s, M_LINUX, M_WAITOK | M_ZERO); - s->refs = 1; - s->group_pid = child; - - LIST_INIT(&s->threads); - em->shared = s; + em->em_tid = newtd->td_proc->p_pid; } + newtd->td_emuldata = em; } else { /* exec */ LIN_SDT_PROBE0(emul, proc_init, exec); /* lookup the old one */ - em = em_find(td->td_proc, EMUL_DOLOCK); + em = em_find(td); KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n")); + + em->em_tid = td->td_proc->p_pid; } em->child_clear_tid = NULL; em->child_set_tid = NULL; + LIN_SDT_PROBE0(emul, proc_init, return); +} + +void +linux_proc_exit(void *arg __unused, struct proc *p) +{ + struct thread *td = curthread; + + if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX)) { + LIN_SDT_PROBE1(emul, proc_exit, entry, p); + (p->p_sysent->sv_thread_detach)(td); + } +} + +int +linux_common_execve(struct thread *td, struct image_args *eargs) +{ + struct linux_emuldata *em; + struct proc *p; + int error; + + p = td->td_proc; + /* - * allocate the shared struct only in clone()/fork cases in the case - * of clone() td = calling proc and child = pid of the newly created - * proc + * Unlike FreeBSD abort all other threads before + * proceeding exec. */ - if (child != 0) { - if (flags & LINUX_CLONE_THREAD) { - /* lookup the parent */ - /* - * we dont have to lock the p_em because - * its waiting for us in linux_clone so - * there is no chance of it changing the - * p_em->shared address - */ - p_em = em_find(td->td_proc, EMUL_DONTLOCK); - KASSERT(p_em != NULL, ("proc_init: parent emuldata not found for CLONE_THREAD\n")); - em->shared = p_em->shared; - EMUL_SHARED_WLOCK(&emul_shared_lock); - em->shared->refs++; - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - } else { - /* - * handled earlier to avoid malloc(M_WAITOK) with - * rwlock held - */ - } + PROC_LOCK(p); + /* See exit1() comments. 
*/ + thread_suspend_check(0); + while (p->p_flag & P_HADTHREADS) { + if (!thread_single(p, SINGLE_EXIT)) + break; + thread_suspend_check(0); + } + PROC_UNLOCK(p); - EMUL_SHARED_WLOCK(&emul_shared_lock); - LIST_INSERT_HEAD(&em->shared->threads, em, threads); - EMUL_SHARED_WUNLOCK(&emul_shared_lock); + error = kern_execve(td, eargs, NULL); + if (error != 0) + return (error); - p = pfind(child); - KASSERT(p != NULL, ("process not found in proc_init\n")); - p->p_emuldata = em; + /* + * In a case of transition from Linux binary execing to + * FreeBSD binary we destroy linux emuldata thread entry. + */ + if (SV_CURPROC_ABI() != SV_ABI_LINUX) { + PROC_LOCK(p); + em = em_find(td); + KASSERT(em != NULL, ("proc_exec: emuldata not found.\n")); + td->td_emuldata = NULL; PROC_UNLOCK(p); - } else - EMUL_UNLOCK(&emul_lock); - LIN_SDT_PROBE0(emul, proc_init, return); + free(em, M_TEMP); + } return (0); } +void +linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) +{ + struct thread *td = curthread; + + /* + * In a case of execing to linux binary we create linux + * emuldata thread entry. + */ + if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) == + SV_ABI_LINUX)) { + LIN_SDT_PROBE2(emul, proc_exec, entry, p, imgp); + if (SV_PROC_ABI(p) == SV_ABI_LINUX) + linux_proc_init(td, NULL, 0); + else + linux_proc_init(td, td, 0); + + LIN_SDT_PROBE0(emul, proc_exec, return); + } +} + void -linux_proc_exit(void *arg __unused, struct proc *p) +linux_thread_detach(struct thread *td) { + struct linux_sys_futex_args cup; struct linux_emuldata *em; - int error, shared_flags, shared_xstat; - struct thread *td = FIRST_THREAD_IN_PROC(p); int *child_clear_tid; - struct proc *q, *nq; + int null = 0; + int error; - if (__predict_true(p->p_sysent != &elf_linux_sysvec)) - return; + LIN_SDT_PROBE1(emul, linux_thread_detach, entry, td); - LIN_SDT_PROBE1(emul, proc_exit, entry, p); + em = em_find(td); + KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); - release_futexes(p); + LINUX_CTR1(exit, "thread detach(%d)", em->em_tid); - /* find the emuldata */ - em = em_find(p, EMUL_DOLOCK); + release_futexes(td, em); - KASSERT(em != NULL, ("proc_exit: emuldata not found.\n")); + child_clear_tid = em->child_clear_tid; - /* reparent all procs that are not a thread leader to initproc */ - if (em->shared->group_pid != p->p_pid) { - LIN_SDT_PROBE3(emul, proc_exit, reparent, - em->shared->group_pid, p->p_pid, p); - - child_clear_tid = em->child_clear_tid; - EMUL_UNLOCK(&emul_lock); - sx_xlock(&proctree_lock); - wakeup(initproc); - PROC_LOCK(p); - proc_reparent(p, initproc); - p->p_sigparent = SIGCHLD; - PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); - } else { - child_clear_tid = em->child_clear_tid; - EMUL_UNLOCK(&emul_lock); - } - - EMUL_SHARED_WLOCK(&emul_shared_lock); - shared_flags = em->shared->flags; - shared_xstat = em->shared->xstat; - LIST_REMOVE(em, threads); - - em->shared->refs--; - if (em->shared->refs == 0) { - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - free(em->shared, M_LINUX); - } else - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - - if ((shared_flags & EMUL_SHARED_HASXSTAT) != 0) - p->p_xstat = shared_xstat; - if (child_clear_tid != NULL) { - struct linux_sys_futex_args cup; - int null = 0; + LINUX_CTR2(exit, "thread detach(%d) %p", + em->em_tid, child_clear_tid); + error = copyout(&null, child_clear_tid, sizeof(null)); if (error) { - LIN_SDT_PROBE1(emul, proc_exit, + LIN_SDT_PROBE1(emul, linux_thread_detach, child_clear_tid_error, error); - free(em, M_LINUX); - - LIN_SDT_PROBE0(emul, 
proc_exit, return); + LIN_SDT_PROBE0(emul, linux_thread_detach, return); return; } - /* futexes stuff */ cup.uaddr = child_clear_tid; cup.op = LINUX_FUTEX_WAKE; cup.val = 0x7fffffff; /* Awake everyone */ cup.timeout = NULL; cup.uaddr2 = NULL; cup.val3 = 0; - error = linux_sys_futex(FIRST_THREAD_IN_PROC(p), &cup); + error = linux_sys_futex(td, &cup); /* * this cannot happen at the moment and if this happens it * probably means there is a user space bug */ if (error) { - LIN_SDT_PROBE0(emul, proc_exit, futex_failed); - printf(LMSG("futex stuff in proc_exit failed.\n")); + LIN_SDT_PROBE0(emul, linux_thread_detach, futex_failed); + printf(LMSG("futex stuff in thread_detach failed.\n")); } } - /* clean the stuff up */ - free(em, M_LINUX); - - /* this is a little weird but rewritten from exit1() */ - sx_xlock(&proctree_lock); - q = LIST_FIRST(&p->p_children); - for (; q != NULL; q = nq) { - nq = LIST_NEXT(q, p_sibling); - if (q->p_flag & P_WEXIT) - continue; - if (__predict_false(q->p_sysent != &elf_linux_sysvec)) - continue; - em = em_find(q, EMUL_DOLOCK); - KASSERT(em != NULL, ("linux_reparent: emuldata not found: %i\n", q->p_pid)); - PROC_LOCK(q); - if ((q->p_flag & P_WEXIT) == 0 && em->pdeath_signal != 0) { - kern_psignal(q, em->pdeath_signal); - } - PROC_UNLOCK(q); - EMUL_UNLOCK(&emul_lock); - } - sx_xunlock(&proctree_lock); - - LIN_SDT_PROBE0(emul, proc_exit, return); + LIN_SDT_PROBE0(emul, linux_thread_detach, return); } -/* - * This is used in a case of transition from FreeBSD binary execing to linux binary - * in this case we create linux emuldata proc entry with the pid of the currently running - * process. - */ -void -linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp) +void +linux_thread_dtor(void *arg __unused, struct thread *td) { - if (__predict_false(imgp->sysent == &elf_linux_sysvec)) { - LIN_SDT_PROBE2(emul, proc_exec, entry, p, imgp); - } - if (__predict_false(imgp->sysent == &elf_linux_sysvec - && p->p_sysent != &elf_linux_sysvec)) - linux_proc_init(FIRST_THREAD_IN_PROC(p), p->p_pid, 0); - if (__predict_false((p->p_sysent->sv_flags & SV_ABI_MASK) == - SV_ABI_LINUX)) - /* Kill threads regardless of imgp->sysent value */ - linux_kill_threads(FIRST_THREAD_IN_PROC(p), SIGKILL); - if (__predict_false(imgp->sysent != &elf_linux_sysvec - && p->p_sysent == &elf_linux_sysvec)) { - struct linux_emuldata *em; + struct linux_emuldata *em; - /* - * XXX:There's a race because here we assign p->p_emuldata NULL - * but the process is still counted as linux one for a short - * time so some other process might reference it and try to - * access its p->p_emuldata and panicing on a NULL reference. 
- */ - em = em_find(p, EMUL_DONTLOCK); + em = em_find(td); + if (em == NULL) + return; + td->td_emuldata = NULL; - KASSERT(em != NULL, ("proc_exec: emuldata not found.\n")); + LINUX_CTR1(exit, "thread dtor(%d)", em->em_tid); - EMUL_SHARED_WLOCK(&emul_shared_lock); - LIST_REMOVE(em, threads); - - PROC_LOCK(p); - p->p_emuldata = NULL; - PROC_UNLOCK(p); - - em->shared->refs--; - if (em->shared->refs == 0) { - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - free(em->shared, M_LINUX); - } else - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - - free(em, M_LINUX); - } - - if (__predict_false(imgp->sysent == &elf_linux_sysvec)) { - LIN_SDT_PROBE0(emul, proc_exec, return); - } + free(em, M_TEMP); } void linux_schedtail(struct thread *td) { struct linux_emuldata *em; struct proc *p; int error = 0; int *child_set_tid; + LIN_SDT_PROBE1(emul, linux_schedtail, entry, td); + p = td->td_proc; - LIN_SDT_PROBE1(emul, linux_schedtail, entry, p); - - /* find the emuldata */ - em = em_find(p, EMUL_DOLOCK); - + em = em_find(td); KASSERT(em != NULL, ("linux_schedtail: emuldata not found.\n")); child_set_tid = em->child_set_tid; - EMUL_UNLOCK(&emul_lock); if (child_set_tid != NULL) { - error = copyout(&p->p_pid, (int *)child_set_tid, - sizeof(p->p_pid)); + error = copyout(&em->em_tid, (int *)child_set_tid, + sizeof(em->em_tid)); + LINUX_CTR4(clone, "schedtail(%d) %p stored %d error %d", + td->td_tid, child_set_tid, em->em_tid, error); if (error != 0) { LIN_SDT_PROBE1(emul, linux_schedtail, copyout_error, error); } - } + } else + LINUX_CTR1(clone, "schedtail(%d)", em->em_tid); LIN_SDT_PROBE0(emul, linux_schedtail, return); - - return; } int linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) { struct linux_emuldata *em; LIN_SDT_PROBE1(emul, linux_set_tid_address, entry, args->tidptr); - /* find the emuldata */ - em = em_find(td->td_proc, EMUL_DOLOCK); - + em = em_find(td); KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); em->child_clear_tid = args->tidptr; - td->td_retval[0] = td->td_proc->p_pid; - EMUL_UNLOCK(&emul_lock); + td->td_retval[0] = em->em_tid; - LIN_SDT_PROBE0(emul, linux_set_tid_address, return); - return 0; -} + LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", + em->em_tid, args->tidptr, td->td_retval[0]); -void -linux_kill_threads(struct thread *td, int sig) -{ - struct linux_emuldata *em, *td_em, *tmp_em; - struct proc *sp; - - LIN_SDT_PROBE2(emul, linux_kill_threads, entry, td, sig); - - td_em = em_find(td->td_proc, EMUL_DONTLOCK); - - KASSERT(td_em != NULL, ("linux_kill_threads: emuldata not found.\n")); - - EMUL_SHARED_RLOCK(&emul_shared_lock); - LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) { - if (em->pid == td_em->pid) - continue; - - sp = pfind(em->pid); - if ((sp->p_flag & P_WEXIT) == 0) - kern_psignal(sp, sig); - PROC_UNLOCK(sp); - - LIN_SDT_PROBE1(emul, linux_kill_threads, kill, em->pid); - } - EMUL_SHARED_RUNLOCK(&emul_shared_lock); - - LIN_SDT_PROBE0(emul, linux_kill_threads, return); + LIN_SDT_PROBE0(emul, linux_set_tid_address, return); + return (0); } Index: head/sys/compat/linux/linux_emul.h =================================================================== --- head/sys/compat/linux/linux_emul.h (revision 283382) +++ head/sys/compat/linux/linux_emul.h (revision 283383) @@ -1,121 +1,64 @@ /*- * Copyright (c) 2006 Roman Divacky + * Copyright (c) 2013 Dmitry Chagin * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_EMUL_H_ #define _LINUX_EMUL_H_ -#define EMUL_SHARED_HASXSTAT 0x01 - -struct linux_emuldata_shared { - int refs; - int flags; - int xstat; - pid_t group_pid; - - LIST_HEAD(, linux_emuldata) threads; /* head of list of linux threads */ -}; - /* * modeled after similar structure in NetBSD * this will be extended as we need more functionality */ struct linux_emuldata { - pid_t pid; - int *child_set_tid; /* in clone(): Child's TID to set on clone */ int *child_clear_tid;/* in clone(): Child's TID to clear on exit */ - struct linux_emuldata_shared *shared; - int pdeath_signal; /* parent death signal */ int flags; /* different emuldata flags */ + int em_tid; /* thread id */ struct linux_robust_list_head *robust_futexes; - - LIST_ENTRY(linux_emuldata) threads; /* list of linux threads */ }; -struct linux_emuldata *em_find(struct proc *, int locked); +struct linux_emuldata *em_find(struct thread *); -/* - * DTrace probes for locks should be fired after locking and before releasing - * to prevent races (to provide data/function stability in dtrace, see the - * output of "dtrace -v ..." and the corresponding dtrace docs). 
- */ -#define EMUL_LOCK(l) do { \ - mtx_lock(l); \ - LIN_SDT_PROBE1(locks, emul_lock, \ - locked, l); \ - } while (0) -#define EMUL_UNLOCK(l) do { \ - LIN_SDT_PROBE1(locks, emul_lock, \ - unlock, l); \ - mtx_unlock(l); \ - } while (0) - -#define EMUL_SHARED_RLOCK(l) do { \ - sx_slock(l); \ - LIN_SDT_PROBE1(locks, emul_shared_rlock, \ - locked, l); \ - } while (0) -#define EMUL_SHARED_RUNLOCK(l) do { \ - LIN_SDT_PROBE1(locks, emul_shared_rlock, \ - unlock, l); \ - sx_sunlock(l); \ - } while (0) -#define EMUL_SHARED_WLOCK(l) do { \ - sx_xlock(l); \ - LIN_SDT_PROBE1(locks, emul_shared_wlock, \ - locked, l); \ - } while (0) -#define EMUL_SHARED_WUNLOCK(l) do { \ - LIN_SDT_PROBE1(locks, emul_shared_wlock, \ - unlock, l); \ - sx_xunlock(l); \ - } while (0) - -/* for em_find use */ -#define EMUL_DOLOCK 1 -#define EMUL_DONTLOCK 0 - /* emuldata flags */ #define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated futex REQUEUE op*/ -int linux_proc_init(struct thread *, pid_t, int); +void linux_proc_init(struct thread *, struct thread *, int); void linux_proc_exit(void *, struct proc *); void linux_schedtail(struct thread *); void linux_proc_exec(void *, struct proc *, struct image_params *); -void linux_kill_threads(struct thread *, int); - -extern struct sx emul_shared_lock; -extern struct mtx emul_lock; +void linux_thread_dtor(void *arg __unused, struct thread *); +void linux_thread_detach(struct thread *); +int linux_common_execve(struct thread *, struct image_args *); #endif /* !_LINUX_EMUL_H_ */ Index: head/sys/compat/linux/linux_fork.c =================================================================== --- head/sys/compat/linux/linux_fork.c (revision 283382) +++ head/sys/compat/linux/linux_fork.c (revision 283383) @@ -1,313 +1,406 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include +#include #include #include #include +#include #include -#include +#include #include #include #include +#include +#include +#include + #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif -#include #include #include #include +#include -/* DTrace init */ -LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); - -/* Linuxulator-global DTrace probes */ -LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked); -LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock); - - int linux_fork(struct thread *td, struct linux_fork_args *args) { int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(fork)) printf(ARGS(fork, "")); #endif if ((error = fork1(td, RFFDG | RFPROC | RFSTOPPED, 0, &p2, NULL, 0)) != 0) return (error); - td->td_retval[0] = p2->p_pid; - td->td_retval[1] = 0; + td2 = FIRST_THREAD_IN_PROC(p2); - error = linux_proc_init(td, td->td_retval[0], 0); - if (error) - return (error); + linux_proc_init(td, td2, 0); - td2 = FIRST_THREAD_IN_PROC(p2); + td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); return (0); } int linux_vfork(struct thread *td, struct linux_vfork_args *args) { int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(vfork)) printf(ARGS(vfork, "")); #endif /* Exclude RFPPWAIT */ if ((error = fork1(td, RFFDG | RFPROC | RFMEM | RFSTOPPED, 0, &p2, NULL, 0)) != 0) return (error); - td->td_retval[0] = p2->p_pid; - error = linux_proc_init(td, td->td_retval[0], 0); - if (error) - return (error); + td2 = FIRST_THREAD_IN_PROC(p2); + linux_proc_init(td, td2, 0); + PROC_LOCK(p2); p2->p_flag |= P_PPWAIT; PROC_UNLOCK(p2); - td2 = FIRST_THREAD_IN_PROC(p2); + td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); return (0); } -int -linux_clone(struct thread *td, struct linux_clone_args *args) +static int +linux_clone_proc(struct thread *td, struct linux_clone_args *args) { int error, ff = RFPROC | RFSTOPPED; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { if (exit_signal <= LINUX_SIGTBLSZ) exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; /* * XXX: In Linux, sharing of fs info (chroot/cwd/umask) * and open files is independant. In FreeBSD, its in one * structure but in reality it does not cause any problems * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; - /* - * Attempt to detect when linux_clone(2) is used for creating - * kernel threads. Unfortunately despite the existence of the - * CLONE_THREAD flag, version of linuxthreads package used in - * most popular distros as of beginning of 2005 doesn't make - * any use of it. 
Therefore, this detection relies on - * empirical observation that linuxthreads sets certain - * combination of flags, so that we can make more or less - * precise detection and notify the FreeBSD kernel that several - * processes are in fact part of the same threading group, so - * that special treatment is necessary for signal delivery - * between those processes and fd locking. - */ - if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) - ff |= RFTHREAD; - if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); error = fork1(td, ff, 0, &p2, NULL, 0); if (error) return (error); - if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { - sx_xlock(&proctree_lock); - PROC_LOCK(p2); - proc_reparent(p2, td->td_proc->p_pptr); - PROC_UNLOCK(p2); - sx_xunlock(&proctree_lock); - } + td2 = FIRST_THREAD_IN_PROC(p2); /* create the emuldata */ - error = linux_proc_init(td, p2->p_pid, args->flags); - /* reference it - no need to check this */ - em = em_find(p2, EMUL_DOLOCK); - KASSERT(em != NULL, ("clone: emuldata not found.")); - /* and adjust it */ + linux_proc_init(td, td2, args->flags); - if (args->flags & LINUX_CLONE_THREAD) { -#ifdef notyet - PROC_LOCK(p2); - p2->p_pgrp = td->td_proc->p_pgrp; - PROC_UNLOCK(p2); -#endif - exit_signal = 0; - } + em = em_find(td2); + KASSERT(em != NULL, ("clone_proc: emuldata not found.\n")); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; - EMUL_UNLOCK(&emul_lock); - if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) printf(LMSG("copyout failed!")); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); - td2 = FIRST_THREAD_IN_PROC(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ - if (args->stack) - linux_set_upcall_kse(td2, PTROUT(args->stack)); + linux_set_upcall_kse(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); #ifdef DEBUG if (ldebug(clone)) printf(LMSG("clone: successful rfork to %d, " "stack %p sig = %d"), (int)p2->p_pid, args->stack, exit_signal); #endif + if (args->flags & LINUX_CLONE_VFORK) { PROC_LOCK(p2); p2->p_flag |= P_PPWAIT; PROC_UNLOCK(p2); } /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); td->td_retval[0] = p2->p_pid; - td->td_retval[1] = 0; if (args->flags & LINUX_CLONE_VFORK) { /* wait for the children to exit, ie. 
emulate vfork */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); } return (0); } -int -linux_exit(struct thread *td, struct linux_exit_args *args) +static int +linux_clone_thread(struct thread *td, struct linux_clone_args *args) { + struct linux_emuldata *em; + struct thread *newtd; + struct proc *p; + int error; #ifdef DEBUG - if (ldebug(exit)) - printf(ARGS(exit, "%d"), args->rval); + if (ldebug(clone)) { + printf(ARGS(clone, "thread: flags %x, stack %p, parent tid: %p, " + "child tid: %p"), (unsigned)args->flags, + args->stack, args->parent_tidptr, args->child_tidptr); + } #endif + LINUX_CTR4(clone, "thread(%d) flags %x ptid %p ctid %p", + td->td_tid, (unsigned)args->flags, + args->parent_tidptr, args->child_tidptr); + + if (args->flags & LINUX_CLONE_PARENT_SETTID) + if (args->parent_tidptr == NULL) + return (EINVAL); + + /* Threads should be created with own stack */ + if (args->stack == NULL) + return (EINVAL); + + p = td->td_proc; + + /* Initialize our td */ + error = kern_thr_alloc(p, 0, &newtd); + if (error) + return (error); + + cpu_set_upcall(newtd, td); + + bzero(&newtd->td_startzero, + __rangeof(struct thread, td_startzero, td_endzero)); + bcopy(&td->td_startcopy, &newtd->td_startcopy, + __rangeof(struct thread, td_startcopy, td_endcopy)); + + newtd->td_proc = p; + newtd->td_ucred = crhold(td->td_ucred); + + /* create the emuldata */ + linux_proc_init(td, newtd, args->flags); + + em = em_find(newtd); + KASSERT(em != NULL, ("clone_thread: emuldata not found.\n")); + + if (args->flags & LINUX_CLONE_SETTLS) + linux_set_cloned_tls(newtd, args->tls); + + if (args->flags & LINUX_CLONE_CHILD_SETTID) + em->child_set_tid = args->child_tidptr; + else + em->child_set_tid = NULL; + + if (args->flags & LINUX_CLONE_CHILD_CLEARTID) + em->child_clear_tid = args->child_tidptr; + else + em->child_clear_tid = NULL; + + cpu_thread_clean(newtd); + + linux_set_upcall_kse(newtd, PTROUT(args->stack)); + + PROC_LOCK(p); + p->p_flag |= P_HADTHREADS; + newtd->td_sigmask = td->td_sigmask; + bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); + + if (args->flags & LINUX_CLONE_PARENT) + thread_link(newtd, p->p_pptr); + else + thread_link(newtd, p); + + thread_lock(td); + /* let the scheduler know about these things. */ + sched_fork_thread(td, newtd); + thread_unlock(td); + if (P_SHOULDSTOP(p)) + newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; + PROC_UNLOCK(p); + + tidhash_add(newtd); + +#ifdef DEBUG + if (ldebug(clone)) + printf(ARGS(clone, "successful clone to %d, stack %p"), + (int)newtd->td_tid, args->stack); +#endif + + LINUX_CTR2(clone, "thread(%d) successful clone to %d", + td->td_tid, newtd->td_tid); + + if (args->flags & LINUX_CLONE_PARENT_SETTID) { + error = copyout(&newtd->td_tid, args->parent_tidptr, + sizeof(newtd->td_tid)); + if (error) + printf(LMSG("clone_thread: copyout failed!")); + } + + /* + * Make this runnable after we are finished with it. 
+ */ + thread_lock(newtd); + TD_SET_CAN_RUN(newtd); + sched_add(newtd, SRQ_BORING); + thread_unlock(newtd); + + td->td_retval[0] = newtd->td_tid; + + return (0); +} + +int +linux_clone(struct thread *td, struct linux_clone_args *args) +{ + + if (args->flags & LINUX_CLONE_THREAD) + return (linux_clone_thread(td, args)); + else + return (linux_clone_proc(td, args)); +} + +int +linux_exit(struct thread *td, struct linux_exit_args *args) +{ + struct linux_emuldata *em; + + em = em_find(td); + KASSERT(em != NULL, ("exit: emuldata not found.\n")); + + LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval); + + linux_thread_detach(td); + + /* + * XXX. When the last two threads of a process + * exit via pthread_exit() try thr_exit() first. + */ + kern_thr_exit(td); exit1(td, W_EXITCODE(args->rval, 0)); /* NOTREACHED */ } Index: head/sys/compat/linux/linux_futex.c =================================================================== --- head/sys/compat/linux/linux_futex.c (revision 283382) +++ head/sys/compat/linux/linux_futex.c (revision 283383) @@ -1,1240 +1,1236 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); -/* Linuxulator-global DTrace probes */ -LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked); -LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock); - /** * Futex part for the special DTrace module "locks". 
*/ LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); /** * Per futex probes. */ LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_put, return); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *", "struct waiting_proc **", "struct futex **"); LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *", "struct waiting_proc **", "int"); LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", "struct waiting_proc *", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int", "struct futex *", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *", "struct waiting_proc **", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *", "struct linux_sys_futex_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, itimerfix_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, 
debug_wait_value_neq, "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *", "struct linux_set_robust_list_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *", "struct linux_get_robust_list_args *"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int"); -LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct proc *", - "uint32_t *", "unsigned int"); +LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, + "struct linux_emuldata *", "uint32_t *", "unsigned int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry, "struct linux_robust_list **", "struct linux_robust_list **", "unsigned int *"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int"); -LIN_SDT_PROBE_DEFINE1(futex, release_futexes, entry, "struct proc *"); +LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *", + "struct linux_emuldata *"); LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return); static MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes"); static MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futexes wp"); struct futex; struct waiting_proc { uint32_t wp_flags; struct futex *wp_futex; TAILQ_ENTRY(waiting_proc) wp_list; }; struct futex { struct sx f_lck; uint32_t *f_uaddr; /* user-supplied value, for debug */ struct umtx_key f_key; uint32_t f_refcount; uint32_t f_bitset; LIST_ENTRY(futex) f_list; TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; }; struct futex_list futex_list; #define FUTEX_LOCK(f) sx_xlock(&(f)->f_lck) #define FUTEX_UNLOCK(f) sx_xunlock(&(f)->f_lck) #define FUTEX_INIT(f) do { \ sx_init_flags(&(f)->f_lck, "ftlk", \ SX_DUPOK); \ LIN_SDT_PROBE1(futex, futex, create, \ &(f)->f_lck); \ } while (0) #define FUTEX_DESTROY(f) do { \ LIN_SDT_PROBE1(futex, futex, destroy, \ &(f)->f_lck); \ sx_destroy(&(f)->f_lck); \ } while (0) #define FUTEX_ASSERT_LOCKED(f) sx_assert(&(f)->f_lck, SA_XLOCKED) struct mtx 
futex_mtx; /* protects the futex list */ #define FUTEXES_LOCK do { \ mtx_lock(&futex_mtx); \ LIN_SDT_PROBE1(locks, futex_mtx, \ locked, &futex_mtx); \ } while (0) #define FUTEXES_UNLOCK do { \ LIN_SDT_PROBE1(locks, futex_mtx, \ unlock, &futex_mtx); \ mtx_unlock(&futex_mtx); \ } while (0) /* flags for futex_get() */ #define FUTEX_CREATE_WP 0x1 /* create waiting_proc */ #define FUTEX_DONTCREATE 0x2 /* don't create futex if not exists */ #define FUTEX_DONTEXISTS 0x4 /* return EINVAL if futex exists */ #define FUTEX_SHARED 0x8 /* shared futex */ /* wp_flags */ #define FUTEX_WP_REQUEUED 0x1 /* wp requeued - wp moved from wp_list * of futex where thread sleep to wp_list * of another futex. */ #define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex * wp_list to prevent double wakeup. */ /* support.s */ int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval); int futex_addl(int oparg, uint32_t *uaddr, int *oldval); int futex_orl(int oparg, uint32_t *uaddr, int *oldval); int futex_andl(int oparg, uint32_t *uaddr, int *oldval); int futex_xorl(int oparg, uint32_t *uaddr, int *oldval); static void futex_put(struct futex *f, struct waiting_proc *wp) { LIN_SDT_PROBE2(futex, futex_put, entry, f, wp); FUTEX_ASSERT_LOCKED(f); if (wp != NULL) { if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); free(wp, M_FUTEX_WP); } FUTEXES_LOCK; if (--f->f_refcount == 0) { LIST_REMOVE(f, f_list); FUTEXES_UNLOCK; FUTEX_UNLOCK(f); LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d " "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); umtx_key_release(&f->f_key); FUTEX_DESTROY(f); free(f, M_FUTEX); LIN_SDT_PROBE0(futex, futex_put, return); return; } LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEXES_UNLOCK; FUTEX_UNLOCK(f); LIN_SDT_PROBE0(futex, futex_put, return); } static int futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags) { struct futex *f, *tmpf; struct umtx_key key; int error; LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags); *newf = tmpf = NULL; error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ? 
AUTO_SHARE : THREAD_SHARE, &key); if (error) { LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error); LIN_SDT_PROBE1(futex, futex_get0, return, error); return (error); } retry: FUTEXES_LOCK; LIST_FOREACH(f, &futex_list, f_list) { if (umtx_key_match(&f->f_key, &key)) { if (tmpf != NULL) { FUTEX_UNLOCK(tmpf); FUTEX_DESTROY(tmpf); free(tmpf, M_FUTEX); } if (flags & FUTEX_DONTEXISTS) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, return, EINVAL); return (EINVAL); } /* * Increment refcount of the found futex to * prevent it from deallocation before FUTEX_LOCK() */ ++f->f_refcount; FUTEXES_UNLOCK; umtx_key_release(&key); FUTEX_LOCK(f); *newf = f; LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d", uaddr, f->f_refcount, f->f_key.shared); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } } if (flags & FUTEX_DONTCREATE) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, null, uaddr); LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } if (tmpf == NULL) { FUTEXES_UNLOCK; tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO); tmpf->f_uaddr = uaddr; tmpf->f_key = key; tmpf->f_refcount = 1; tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY; FUTEX_INIT(tmpf); TAILQ_INIT(&tmpf->f_waiting_proc); /* * Lock the new futex before an insert into the futex_list * to prevent futex usage by other. */ FUTEX_LOCK(tmpf); goto retry; } LIST_INSERT_HEAD(&futex_list, tmpf, f_list); FUTEXES_UNLOCK; LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount, tmpf->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new", uaddr, tmpf->f_refcount, tmpf->f_key.shared); *newf = tmpf; LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } static int futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f, uint32_t flags) { int error; LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f); if (flags & FUTEX_CREATE_WP) { *wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK); (*wp)->wp_flags = 0; } error = futex_get0(uaddr, f, flags); if (error) { LIN_SDT_PROBE0(futex, futex_get, error); if (flags & FUTEX_CREATE_WP) free(*wp, M_FUTEX_WP); LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } if (flags & FUTEX_CREATE_WP) { TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list); (*wp)->wp_futex = *f; } LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } static int futex_sleep(struct futex *f, struct waiting_proc *wp, int timeout) { int error; FUTEX_ASSERT_LOCKED(f); LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, timeout); LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %d ref %d", f->f_uaddr, wp, timeout, f->f_refcount); error = sx_sleep(wp, &f->f_lck, PCATCH, "futex", timeout); if (wp->wp_flags & FUTEX_WP_REQUEUED) { KASSERT(f != wp->wp_futex, ("futex != wp_futex")); if (error) { LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); } LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp" " %p requeued uaddr %p ref %d", error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); futex_put(f, NULL); f = wp->wp_futex; FUTEX_LOCK(f); } else { if (error) { LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error, f->f_uaddr, wp); } LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p", error, f->f_uaddr, wp); } futex_put(f, wp); 
LIN_SDT_PROBE1(futex, futex_sleep, return, error); return (error); } static int futex_wake(struct futex *f, int n, uint32_t bitset) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL); return (EINVAL); } FUTEX_ASSERT_LOCKED(f); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp, f->f_refcount); LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d", f->f_uaddr, wp, f->f_refcount); /* * Unless we find a matching bit in * the bitset, continue searching. */ if (!(wp->wp_futex->f_bitset & bitset)) continue; wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp); wakeup_one(wp); if (++count == n) break; } LIN_SDT_PROBE1(futex, futex_wake, return, count); return (count); } static int futex_requeue(struct futex *f, int n, struct futex *f2, int n2) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2); FUTEX_ASSERT_LOCKED(f); FUTEX_ASSERT_LOCKED(f2); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { if (++count <= n) { LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p", f->f_uaddr, wp); wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp); wakeup_one(wp); } else { LIN_SDT_PROBE3(futex, futex_requeue, requeue, f->f_uaddr, wp, f2->f_uaddr); LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p", f->f_uaddr, wp, f2->f_uaddr); wp->wp_flags |= FUTEX_WP_REQUEUED; /* Move wp to wp_list of f2 futex */ TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list); /* * Thread which sleeps on wp after waking should * acquire f2 lock, so increment refcount of f2 to * prevent it from premature deallocation. 
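	 * The moved waiter notices the transfer in futex_sleep():
	 * FUTEX_WP_REQUEUED makes it drop the original futex and re-lock
	 * wp_futex (i.e. f2), consuming the reference taken here.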
*/ wp->wp_futex = f2; FUTEXES_LOCK; ++f2->f_refcount; FUTEXES_UNLOCK; if (count - n >= n2) break; } } LIN_SDT_PROBE1(futex, futex_requeue, return, count); return (count); } static int futex_wait(struct futex *f, struct waiting_proc *wp, int timeout_hz, uint32_t bitset) { int error; LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, timeout_hz, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL); return (EINVAL); } f->f_bitset = bitset; error = futex_sleep(f, wp, timeout_hz); if (error) LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); if (error == EWOULDBLOCK) error = ETIMEDOUT; LIN_SDT_PROBE1(futex, futex_wait, return, error); return (error); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr); if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); /* XXX: Linux verifies access here and returns EFAULT */ LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) { LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { int clockrt, nrwake, op_ret, ret; struct linux_emuldata *em; struct waiting_proc *wp; struct futex *f, *f2; struct l_timespec timeout; struct timeval utv, ctv; int timeout_hz; int error; uint32_t flags, val; LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args); if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else flags = FUTEX_SHARED; /* * Currently support for switching between CLOCK_MONOTONIC and * CLOCK_REALTIME is not present. However Linux forbids the use of * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and * FUTEX_WAIT_REQUEUE_PI. 
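	 *
	 * For illustration only (hypothetical Linux caller, using the Linux
	 * constant names rather than the LINUX_-prefixed ones used here), a
	 * bitset wait with an absolute CLOCK_REALTIME deadline would look
	 * like:
	 *
	 *	syscall(SYS_futex, &word,
	 *	    FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME,
	 *	    expected, &abstime, NULL, FUTEX_BITSET_MATCH_ANY);
	 *
	 * The private and clock bits are stripped below, and for
	 * FUTEX_WAIT_BITSET (and any clockrt wait) the absolute timeout is
	 * converted to a relative one before being handed to tvtohz().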
*/ clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } error = 0; f = f2 = NULL; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, &wp, &f, flags | FUTEX_CREATE_WP); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = copyin(args->uaddr, &val, sizeof(val)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val) { LIN_SDT_PROBE4(futex, linux_sys_futex, debug_wait_value_neq, args->uaddr, args->val, val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x != uval 0x%x", args->uaddr, args->val, val); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EWOULDBLOCK); return (EWOULDBLOCK); } if (args->timeout != NULL) { error = copyin(args->timeout, &timeout, sizeof(timeout)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); futex_put(f, wp); return (error); } TIMESPEC_TO_TIMEVAL(&utv, &timeout); error = itimerfix(&utv); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, itimerfix_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); futex_put(f, wp); return (error); } if (clockrt) { microtime(&ctv); timevalsub(&utv, &ctv); } else if (args->op == LINUX_FUTEX_WAIT_BITSET) { microuptime(&ctv); timevalsub(&utv, &ctv); } if (utv.tv_sec < 0) timevalclear(&utv); timeout_hz = tvtohz(&utv); } else timeout_hz = 0; error = futex_wait(f, wp, timeout_hz, args->val3); break; case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTCREATE); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (f == NULL) { td->td_retval[0] = 0; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } td->td_retval[0] = futex_wake(f, args->val, args->val3); futex_put(f, NULL); break; case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. 
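	 * CMP_REQUEUE wakes at most args->val waiters on uaddr and moves up
	 * to "nrequeue" more (passed in the otherwise unused timeout
	 * argument) onto uaddr2, provided *uaddr still equals args->val3;
	 * otherwise the operation fails with EAGAIN.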
*/ if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } error = futex_get(args->uaddr, NULL, &f, flags); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * To avoid deadlocks return EINVAL if second futex * exists at this time. * * Glibc fall back to FUTEX_WAKE in case of any error * returned by FUTEX_CMP_REQUEUE. */ error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTEXISTS); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = copyin(args->uaddr, &val, sizeof(val)); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", error); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val3) { LIN_SDT_PROBE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, args->val, val); LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", args->val, val); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN); return (EAGAIN); } nrwake = (int)(unsigned long)args->timeout; td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake); futex_put(f2, NULL); futex_put(f, NULL); break; case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->timeout); error = futex_get(args->uaddr, NULL, &f, flags); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (args->uaddr != args->uaddr2) error = futex_get(args->uaddr2, NULL, &f2, flags); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * This function returns positive number as results and * negative as errors */ op_ret = futex_atomic_op(td, args->val3, args->uaddr2); LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", args->uaddr, op_ret); if (op_ret < 0) { /* XXX: We don't handle the EFAULT yet. 
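	 *
	 * For reference, the encoded_op decoded by futex_atomic_op() above
	 * is packed as in the Linux uapi header (shown for illustration
	 * only, this macro is not defined in this file):
	 *
	 *	#define FUTEX_OP(op, oparg, cmp, cmparg)		\
	 *	    (((op) & 0xf) << 28 | ((cmp) & 0xf) << 24 |		\
	 *	    ((oparg) & 0xfff) << 12 | ((cmparg) & 0xfff))
	 *
	 * e.g. FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1), which glibc
	 * used for its pthread_cond_broadcast() fast path.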
*/ if (op_ret != -EFAULT) { if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, -op_ret); return (-op_ret); } else { LIN_SDT_PROBE0(futex, linux_sys_futex, unhandled_efault); } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EFAULT); return (EFAULT); } ret = futex_wake(f, args->val, args->val3); if (op_ret > 0) { op_ret = 0; nrwake = (int)(unsigned long)args->timeout; if (f2 != NULL) op_ret += futex_wake(f2, nrwake, args->val3); else op_ret += futex_wake(f, nrwake, args->val3); ret += op_ret; } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); td->td_retval[0] = ret; break; case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_LOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_UNLOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_TRYLOCK_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ - em = em_find(td->td_proc, EMUL_DONTLOCK); + em = em_find(td); if ((em->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_requeue op\n"); em->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_sys_futex, deprecated_requeue); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op FUTEX_WAIT_REQUEUE_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ linux_msg(td, "linux_sys_futex: " "op LINUX_FUTEX_CMP_REQUEUE_PI not implemented\n"); LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); default: linux_msg(td, "linux_sys_futex: unknown op %d\n", args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation, args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args); if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL); return (EINVAL); } - em = em_find(td->td_proc, EMUL_DOLOCK); + em = em_find(td); em->robust_futexes = args->head; - EMUL_UNLOCK(&emul_lock); LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0); return (0); } int 
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); + struct thread *td2; int error = 0; LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args); if (!args->pid) { - em = em_find(td->td_proc, EMUL_DONTLOCK); + em = em_find(td); + KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { - struct proc *p; - - p = pfind(args->pid); - if (p == NULL) { + td2 = tdfind(args->pid, -1); + if (td2 == NULL) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, ESRCH); return (ESRCH); } - em = em_find(p, EMUL_DONTLOCK); + em = em_find(td2); + KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || - p_candebug(td, p)) { - PROC_UNLOCK(p); + p_candebug(td, td2->td_proc)) { + PROC_UNLOCK(td2->td_proc); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); return (EPERM); } head = em->robust_futexes; - PROC_UNLOCK(p); + PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT); return (EFAULT); } error = copyout(head, args->head, sizeof(struct linux_robust_list_head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error); return (error); } static int -handle_futex_death(struct proc *p, uint32_t *uaddr, unsigned int pi) +handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, + unsigned int pi) { uint32_t uval, nval, mval; struct futex *f; int error; - LIN_SDT_PROBE3(futex, handle_futex_death, entry, p, uaddr, pi); + LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi); retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } - if ((uval & FUTEX_TID_MASK) == p->p_pid) { + if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) { LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_get(uaddr, NULL, &f, FUTEX_DONTCREATE | FUTEX_SHARED); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, return, error); return (error); } if (f != NULL) { futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); futex_put(f, NULL); } } } LIN_SDT_PROBE1(futex, handle_futex_death, return, 0); return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi); error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0); return (0); } /* This walks the list of robust futexes releasing them. 
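 * The list lives entirely in user memory and is only ever read with
 * copyin().  Its layout mirrors Linux's struct robust_list_head; roughly
 * (sketched here for reference, the real definitions live in the
 * Linuxulator headers):
 *
 *	struct linux_robust_list      { l_uintptr_t next; };
 *	struct linux_robust_list_head {
 *		struct linux_robust_list	list;
 *		l_long				futex_offset;
 *		l_uintptr_t			pending_list;
 *	};
 *
 * Each entry is embedded in a user-space lock; adding futex_offset to the
 * entry's address yields the 32-bit futex word that handle_futex_death()
 * marks with FUTEX_OWNER_DIED and, if it has waiters, wakes.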
*/ void -release_futexes(struct proc *p) +release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; - struct linux_emuldata *em; l_long futex_offset; int rc, error; - LIN_SDT_PROBE1(futex, release_futexes, entry, p); + LIN_SDT_PROBE2(futex, release_futexes, entry, td, em); - em = em_find(p, EMUL_DONTLOCK); head = em->robust_futexes; if (head == NULL) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) - if (handle_futex_death(p, + if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (rc) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) - handle_futex_death(p, (uint32_t *)((caddr_t)pending + futex_offset), pip); + handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); LIN_SDT_PROBE0(futex, release_futexes, return); } Index: head/sys/compat/linux/linux_futex.h =================================================================== --- head/sys/compat/linux/linux_futex.h (revision 283382) +++ head/sys/compat/linux/linux_futex.h (revision 283383) @@ -1,81 +1,82 @@ /* $NetBSD: linux_futex.h,v 1.2 2005/12/11 12:20:19 christos Exp $ */ /*- * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_FUTEX_H #define _LINUX_FUTEX_H extern LIST_HEAD(futex_list, futex) futex_list; extern struct mtx futex_mtx; #define LINUX_FUTEX_WAIT 0 #define LINUX_FUTEX_WAKE 1 #define LINUX_FUTEX_FD 2 /* unused */ #define LINUX_FUTEX_REQUEUE 3 #define LINUX_FUTEX_CMP_REQUEUE 4 #define LINUX_FUTEX_WAKE_OP 5 #define LINUX_FUTEX_LOCK_PI 6 #define LINUX_FUTEX_UNLOCK_PI 7 #define LINUX_FUTEX_TRYLOCK_PI 8 #define LINUX_FUTEX_WAIT_BITSET 9 #define LINUX_FUTEX_WAKE_BITSET 10 #define LINUX_FUTEX_WAIT_REQUEUE_PI 11 #define LINUX_FUTEX_CMP_REQUEUE_PI 12 #define LINUX_FUTEX_PRIVATE_FLAG 128 #define LINUX_FUTEX_CLOCK_REALTIME 256 #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ #define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ #define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ #define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ #define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ #define FUTEX_OP_OPARG_SHIFT 8 /* Use (1 << OPARG) instead of OPARG. */ #define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ #define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ #define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ #define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ #define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ #define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ #define FUTEX_WAITERS 0x80000000 #define FUTEX_OWNER_DIED 0x40000000 #define FUTEX_TID_MASK 0x3fffffff #define FUTEX_BITSET_MATCH_ANY 0xffffffff -void release_futexes(struct proc *); +void release_futexes(struct thread *, + struct linux_emuldata *); #endif /* !_LINUX_FUTEX_H */ Index: head/sys/compat/linux/linux_misc.c =================================================================== --- head/sys/compat/linux/linux_misc.c (revision 283382) +++ head/sys/compat/linux/linux_misc.c (revision 283383) @@ -1,2032 +1,2010 @@ /*- * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif -#include #include #include #include #include #include #include #include -/* DTrace init */ -LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); - -/* Linuxulator-global DTrace probes */ -LIN_SDT_PROBE_DECLARE(locks, emul_lock, locked); -LIN_SDT_PROBE_DECLARE(locks, emul_lock, unlock); -LIN_SDT_PROBE_DECLARE(locks, emul_shared_rlock, locked); -LIN_SDT_PROBE_DECLARE(locks, emul_shared_rlock, unlock); -LIN_SDT_PROBE_DECLARE(locks, emul_shared_wlock, locked); -LIN_SDT_PROBE_DECLARE(locks, emul_shared_wlock, unlock); - int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalbig; l_ulong freebig; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; vm_object_t object; int i, j; struct timespec ts; getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - vm_cnt.v_wire_count * PAGE_SIZE; sysinfo.sharedram = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) sysinfo.sharedram += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); sysinfo.sharedram *= PAGE_SIZE; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* The following are only present in newer Linux kernels. 
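	 * They correspond to totalhigh, freehigh and mem_unit in Linux's
	 * struct sysinfo; FreeBSD has no highmem split, so both sizes are
	 * reported as zero with a mem_unit of one byte.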
*/ sysinfo.totalbig = 0; sysinfo.freebig = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error; #ifdef DEBUG if (ldebug(alarm)) printf(ARGS(alarm, "%u"), args->secs); #endif secs = args->secs; if (secs > INT_MAX) secs = INT_MAX; it.it_value.tv_sec = (long) secs; it.it_value.tv_usec = 0; it.it_interval.tv_sec = 0; it.it_interval.tv_usec = 0; error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); if (error) return (error); if (timevalisset(&old_it.it_value)) { if (old_it.it_value.tv_usec != 0) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; } return (0); } int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old; struct obreak_args /* { char * nsize; } */ tmp; #ifdef DEBUG if (ldebug(brk)) printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend); #endif old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (vm_offset_t)args->dsend; tmp.nsize = (char *)new; if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp)) td->td_retval[0] = (long)new; else td->td_retval[0] = (long)old; return (0); } #if defined(__i386__) /* XXX: what about amd64/linux32? */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; char *library; ssize_t aresid; int error, locked, writecount; LCONVPATHEXIST(td, args->library, &library); #ifdef DEBUG if (ldebug(uselib)) printf(ARGS(uselib, "%s"), library); #endif a_out = NULL; locked = 0; vp = NULL; NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) goto cleanup; vp = ni.ni_vp; NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = 1; /* Writable? */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) goto cleanup; if (writecount != 0) { error = ETXTBSY; goto cleanup; } /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. 
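	 * ZMAGIC (0413) images keep the a.out header in the first 1024
	 * bytes of the file, so their text is read from offset 1024;
	 * QMAGIC (0314) images map the header as part of the text, so the
	 * file offset is 0.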
*/ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ VOP_SET_TEXT(vp); /* * Lock no longer needed */ locked = 0; VOP_UNLOCK(vp, 0); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("uselib: Non page aligned binary %lu\n", file_offset); #endif /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { #ifdef DEBUG printf("uselib: Page aligned binary %lu\n", file_offset); #endif /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], ((long *)vmaddr)[1]); #endif if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: /* Unlock vnode if needed */ if (locked) VOP_UNLOCK(vp, 0); /* Release the temporary mapping. */ if (a_out) kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE); return (error); } #endif /* __i386__ */ int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; #ifdef DEBUG if (ldebug(select)) printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds, (void *)args->readfds, (void *)args->writefds, (void *)args->exceptfds, (void *)args->timeout); #endif /* * Store current time for computation of the amount of * time left. 
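	 * Linux select(2) writes the time remaining back into the caller's
	 * timeval, so the current time is sampled into tv0 before the call
	 * and used afterwards to compute remaining = requested - (tv1 - tv0).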
*/ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; #ifdef DEBUG if (ldebug(select)) printf(LMSG("incoming timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, sizeof(l_int) * 8); #ifdef DEBUG if (ldebug(select)) printf(LMSG("real select returns %d"), error); #endif if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); #ifdef DEBUG if (ldebug(select)) printf(LMSG("outgoing timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: #ifdef DEBUG if (ldebug(select)) printf(LMSG("select_out -> %d"), error); #endif return (error); } int linux_mremap(struct thread *td, struct linux_mremap_args *args) { struct munmap_args /* { void *addr; size_t len; } */ bsd_args; int error = 0; #ifdef DEBUG if (ldebug(mremap)) printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"), (void *)(uintptr_t)args->addr, (unsigned long)args->old_len, (unsigned long)args->new_len, (unsigned long)args->flags); #endif if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { bsd_args.addr = (caddr_t)((uintptr_t)args->addr + args->new_len); bsd_args.len = args->old_len - args->new_len; error = sys_munmap(td, &bsd_args); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { struct msync_args bsd_args; bsd_args.addr = (caddr_t)(uintptr_t)args->addr; bsd_args.len = (uintptr_t)args->len; bsd_args.flags = args->fl & ~LINUX_MS_SYNC; return (sys_msync(td, &bsd_args)); } int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; #ifdef DEBUG if (ldebug(time)) printf(ARGS(time, "*")); #endif microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. 
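 * CONVTCK() below therefore picks the conversion at run time: processes
 * emulating a 2.4.0 or newer kernel are scaled by the real statistics
 * clock frequency (stclohz), older ones by the hard-coded 100 Hz value,
 * i.e.
 *
 *	ticks = tv_sec * hz + tv_usec / (1000000 / hz)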
*/ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; #ifdef DEBUG if (ldebug(times)) printf(ARGS(times, "*")); #endif if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; #ifdef DEBUG if (ldebug(newuname)) printf(ARGS(newuname, "*")); #endif linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } strlcpy(utsname.machine, linux_platform, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utime)) printf(ARGS(utime, "%s, *"), fname); #endif if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utimes)) printf(ARGS(utimes, "%s, *"), fname); #endif if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &fname, dfd); #ifdef DEBUG if (ldebug(futimesat)) printf(ARGS(futimesat, "%s, *"), fname); #endif if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_common_wait(struct thread *td, int pid, int *status, int options, struct rusage *ru) { int error, tmpstat; error = kern_wait(td, pid, &tmpstat, options, ru); if (error) return (error); if (status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8); error = copyout(&tmpstat, status, sizeof(int)); } return (error); } int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { int options; #ifdef DEBUG if (ldebug(waitpid)) printf(ARGS(waitpid, "%d, %p, %d"), args->pid, (void *)args->status, args->options); #endif /* * this is necessary because the test in kern_wait doesn't work * because we mess with the options here */ if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE)) return (EINVAL); options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; return (linux_common_wait(td, args->pid, args->status, options, NULL)); } int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mknod)) printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHCREAT_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(mknodat)) printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, dfd, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! 
*/ int linux_personality(struct thread *td, struct linux_personality_args *args) { #ifdef DEBUG if (ldebug(personality)) printf(ARGS(personality, "%lu"), (unsigned long)args->per); #endif if (args->per != 0) return (EINVAL); /* Yes Jim, it's still a Linux... */ td->td_retval[0] = 0; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; #ifdef DEBUG if (ldebug(setitimer)) printf(ARGS(setitimer, "%p, %p"), (void *)uap->itv, (void *)uap->oitv); #endif if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); #ifdef DEBUG if (ldebug(setitimer)) { printf("setitimer: value: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec); printf("setitimer: interval: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec); } #endif error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; #ifdef DEBUG if (ldebug(getitimer)) printf(ARGS(getitimer, "%p"), (void *)uap->itv); #endif error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } int linux_nice(struct thread *td, struct linux_nice_args *args) { struct setpriority_args bsd_args; bsd_args.which = PRIO_PROCESS; bsd_args.who = 0; /* current process */ bsd_args.prio = args->inc; return (sys_setpriority(td, &bsd_args)); } int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. 
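	 * The Linux group list is therefore copied into cr_groups[1..ngrp]
	 * below, and cr_ngroups becomes ngrp + 1.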
*/ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_TEMP); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_TEMP, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_TEMP); if (error) return (error); td->td_retval[0] = ngrp; return (0); } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; #ifdef DEBUG if (ldebug(setrlimit)) printf(ARGS(setrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(old_getrlimit)) printf(ARGS(old_getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(getrlimit)) printf(ARGS(getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int 
linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { - struct sched_setscheduler_args bsd; + struct sched_param sched_param; + struct thread *tdt; + int error, policy; #ifdef DEBUG if (ldebug(sched_setscheduler)) printf(ARGS(sched_setscheduler, "%d, %d, %p"), args->pid, args->policy, (const void *)args->param); #endif switch (args->policy) { case LINUX_SCHED_OTHER: - bsd.policy = SCHED_OTHER; + policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: - bsd.policy = SCHED_FIFO; + policy = SCHED_FIFO; break; case LINUX_SCHED_RR: - bsd.policy = SCHED_RR; + policy = SCHED_RR; break; default: return (EINVAL); } - bsd.pid = args->pid; - bsd.param = (struct sched_param *)args->param; - return (sys_sched_setscheduler(td, &bsd)); + error = copyin(args->param, &sched_param, sizeof(sched_param)); + if (error) + return (error); + + tdt = linux_tdfind(td, args->pid, -1); + if (tdt == NULL) + return (ESRCH); + + error = kern_sched_setscheduler(td, tdt, policy, &sched_param); + PROC_UNLOCK(tdt->td_proc); + return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { - struct sched_getscheduler_args bsd; - int error; + struct thread *tdt; + int error, policy; #ifdef DEBUG if (ldebug(sched_getscheduler)) printf(ARGS(sched_getscheduler, "%d"), args->pid); #endif - bsd.pid = args->pid; - error = sys_sched_getscheduler(td, &bsd); + tdt = linux_tdfind(td, args->pid, -1); + if (tdt == NULL) + return (ESRCH); - switch (td->td_retval[0]) { + error = kern_sched_getscheduler(td, tdt, &policy); + PROC_UNLOCK(tdt->td_proc); + + switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } - return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_max)) printf(ARGS(sched_get_priority_max, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_min)) printf(ARGS(sched_get_priority_min, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; #ifdef DEBUG if (ldebug(reboot)) printf(ARGS(reboot, "0x%x"), args->cmd); #endif if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } 
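	/*
	 * Map the Linux reboot commands onto reboot(2) flags: HALT ->
	 * RB_HALT, POWEROFF -> RB_POWEROFF, RESTART/RESTART2 -> a normal
	 * reboot.  CAD_ON/CAD_OFF only perform the privilege check and
	 * return.
	 */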
switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, &bsd_args)); } /* * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that * are assumed to be preserved. The following lightweight syscalls fixes * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c * * linux_getpid() - MP SAFE * linux_getgid() - MP SAFE * linux_getuid() - MP SAFE */ int linux_getpid(struct thread *td, struct linux_getpid_args *args) { - struct linux_emuldata *em; #ifdef DEBUG if (ldebug(getpid)) printf(ARGS(getpid, "")); #endif + td->td_retval[0] = td->td_proc->p_pid; - if (linux_use26(td)) { - em = em_find(td->td_proc, EMUL_DONTLOCK); - KASSERT(em != NULL, ("getpid: emuldata not found.\n")); - td->td_retval[0] = em->shared->group_pid; - } else { - td->td_retval[0] = td->td_proc->p_pid; - } - return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { + struct linux_emuldata *em; #ifdef DEBUG if (ldebug(gettid)) printf(ARGS(gettid, "")); #endif - td->td_retval[0] = td->td_proc->p_pid; + em = em_find(td); + KASSERT(em != NULL, ("gettid: emuldata not found.\n")); + + td->td_retval[0] = em->em_tid; + return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { - struct linux_emuldata *em; - struct proc *p, *pp; #ifdef DEBUG if (ldebug(getppid)) printf(ARGS(getppid, "")); #endif - if (!linux_use26(td)) { - PROC_LOCK(td->td_proc); - td->td_retval[0] = td->td_proc->p_pptr->p_pid; - PROC_UNLOCK(td->td_proc); - return (0); - } - - em = em_find(td->td_proc, EMUL_DONTLOCK); - - KASSERT(em != NULL, ("getppid: process emuldata not found.\n")); - - /* find the group leader */ - p = pfind(em->shared->group_pid); - - if (p == NULL) { -#ifdef DEBUG - printf(LMSG("parent process not found.\n")); -#endif - return (0); - } - - pp = p->p_pptr; /* switch to parent */ - PROC_LOCK(pp); - PROC_UNLOCK(p); - - /* if its also linux process */ - if (pp->p_sysent == &elf_linux_sysvec) { - em = em_find(pp, EMUL_DONTLOCK); - KASSERT(em != NULL, ("getppid: parent emuldata not found.\n")); - - td->td_retval[0] = em->shared->group_pid; - } else - td->td_retval[0] = pp->p_pid; - - PROC_UNLOCK(pp); - + PROC_LOCK(td->td_proc); + td->td_retval[0] = td->td_proc->p_pptr->p_pid; + PROC_UNLOCK(td->td_proc); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { #ifdef DEBUG if (ldebug(getgid)) printf(ARGS(getgid, "")); #endif td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { #ifdef DEBUG if (ldebug(getuid)) printf(ARGS(getuid, "")); #endif td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { struct getsid_args bsd; #ifdef DEBUG if (ldebug(getsid)) printf(ARGS(getsid, "%i"), args->pid); #endif bsd.pid = args->pid; return (sys_getsid(td, &bsd)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { struct getpriority_args bsd_args; int error; #ifdef DEBUG if (ldebug(getpriority)) printf(ARGS(getpriority, "%i, %i"), args->which, args->who); #endif bsd_args.which 
= args->which; bsd_args.who = args->who; error = sys_getpriority(td, &bsd_args); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(sethostname)) printf(ARGS(sethostname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(setdomainname)) printf(ARGS(setdomainname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { - struct linux_emuldata *em; #ifdef DEBUG if (ldebug(exit_group)) printf(ARGS(exit_group, "%i"), args->error_code); #endif - em = em_find(td->td_proc, EMUL_DONTLOCK); - if (em->shared->refs > 1) { - EMUL_SHARED_WLOCK(&emul_shared_lock); - em->shared->flags |= EMUL_SHARED_HASXSTAT; - em->shared->xstat = W_EXITCODE(args->error_code, 0); - EMUL_SHARED_WUNLOCK(&emul_shared_lock); - if (linux_use26(td)) - linux_kill_threads(td, SIGKILL); - } + LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, + args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, W_EXITCODE(args->error_code, 0)); - - return (0); + /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION 0x19980330 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (args->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. */ bzero (&lucd, sizeof(lucd)); error = copyout(&lucd, args->datap, sizeof(lucd)); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL || args->datap == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(args->datap, &lucd, sizeof(lucd)); if (error != 0) return (error); /* We currently don't support setting any capabilities. 
*/ if (lucd.effective || lucd.permitted || lucd.inheritable) { linux_msg(td, "capset effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", (int)lucd.effective, (int)lucd.permitted, (int)lucd.inheritable); return (EPERM); } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; struct linux_emuldata *em; int pdeath_signal; #ifdef DEBUG if (ldebug(prctl)) printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option, args->arg2, args->arg3, args->arg4, args->arg5); #endif switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); - em = em_find(p, EMUL_DOLOCK); + em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); em->pdeath_signal = args->arg2; - EMUL_UNLOCK(&emul_lock); break; case LINUX_PR_GET_PDEATHSIG: - em = em_find(p, EMUL_DOLOCK); + em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); pdeath_signal = em->pdeath_signal; - EMUL_UNLOCK(&emul_lock); error = copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal)); break; case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). */ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; default: error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; - struct proc *p; int error; #ifdef DEBUG if (ldebug(sched_setparam)) printf(ARGS(sched_setparam, "%d, *"), uap->pid); #endif error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); - if (uap->pid == 0) { - tdt = td; - p = tdt->td_proc; - PROC_LOCK(p); - } else { - p = pfind(uap->pid); - if (p == NULL) - return (ESRCH); - /* - * XXX. Scheduling parameters are in fact per-thread - * attributes in Linux. Temporarily use the first - * thread in proc. The same for get_param(). 
- */ - tdt = FIRST_THREAD_IN_PROC(p); - } + tdt = linux_tdfind(td, uap->pid, -1); + if (tdt == NULL) + return (ESRCH); error = kern_sched_setparam(td, tdt, &sched_param); - PROC_UNLOCK(p); + PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; - struct proc *p; int error; #ifdef DEBUG if (ldebug(sched_getparam)) printf(ARGS(sched_getparam, "%d, *"), uap->pid); #endif - if (uap->pid == 0) { - tdt = td; - p = tdt->td_proc; - PROC_LOCK(p); - } else { - p = pfind(uap->pid); - if (p == NULL) - return (ESRCH); - tdt = FIRST_THREAD_IN_PROC(p); - } + tdt = linux_tdfind(td, uap->pid, -1); + if (tdt == NULL) + return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); - PROC_UNLOCK(p); + PROC_UNLOCK(tdt->td_proc); if (error == 0) error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { int error; + struct thread *tdt; struct cpuset_getaffinity_args cga; #ifdef DEBUG if (ldebug(sched_getaffinity)) printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); + tdt = linux_tdfind(td, args->pid, -1); + if (tdt == NULL) + return (ESRCH); + + PROC_UNLOCK(tdt->td_proc); cga.level = CPU_LEVEL_WHICH; - cga.which = CPU_WHICH_PID; - cga.id = args->pid; + cga.which = CPU_WHICH_TID; + cga.id = tdt->td_tid; cga.cpusetsize = sizeof(cpuset_t); cga.mask = (cpuset_t *) args->user_mask_ptr; if ((error = sys_cpuset_getaffinity(td, &cga)) == 0) td->td_retval[0] = sizeof(cpuset_t); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct cpuset_setaffinity_args csa; + struct thread *tdt; #ifdef DEBUG if (ldebug(sched_setaffinity)) printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); + tdt = linux_tdfind(td, args->pid, -1); + if (tdt == NULL) + return (ESRCH); + + PROC_UNLOCK(tdt->td_proc); csa.level = CPU_LEVEL_WHICH; - csa.which = CPU_WHICH_PID; - csa.id = args->pid; + csa.which = CPU_WHICH_TID; + csa.id = tdt->td_tid; csa.cpusetsize = sizeof(cpuset_t); csa.mask = (cpuset_t *) args->user_mask_ptr; return (sys_cpuset_setaffinity(td, &csa)); } int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; struct l_timespec lts; struct thread *tdt; - struct proc *p; int error; - if (uap->pid == 0) { - tdt = td; - p = tdt->td_proc; - PROC_LOCK(p); - } else { - p = pfind(uap->pid); - if (p == NULL) - return (ESRCH); - tdt = FIRST_THREAD_IN_PROC(p); - } + tdt = linux_tdfind(td, uap->pid, -1); + if (tdt == NULL) + return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, &ts); - PROC_UNLOCK(p); + PROC_UNLOCK(tdt->td_proc); if (error != 0) return (error); lts.tv_sec = ts.tv_sec; lts.tv_nsec = ts.tv_nsec; return (copyout(<s, uap->interval, sizeof(lts))); +} + +/* + * In case when the Linux thread is the initial thread in + * the thread group thread id is equal to the process id. + * Glibc depends on this magic (assert in pthread_getattr_np.c). 
+ */ +struct thread * +linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) +{ + struct linux_emuldata *em; + struct thread *tdt; + struct proc *p; + + tdt = NULL; + if (tid == 0 || tid == td->td_tid) { + tdt = td; + PROC_LOCK(tdt->td_proc); + } else if (tid > PID_MAX) + tdt = tdfind(tid, pid); + else { + /* + * Initial thread where the tid equal to the pid. + */ + p = pfind(tid); + if (p != NULL) { + if (SV_PROC_ABI(p) != SV_ABI_LINUX) { + /* + * p is not a Linuxulator process. + */ + PROC_UNLOCK(p); + return (NULL); + } + FOREACH_THREAD_IN_PROC(p, tdt) { + em = em_find(tdt); + if (tid == em->em_tid) + return (tdt); + } + PROC_UNLOCK(p); + } + return (NULL); + } + + return (tdt); } Index: head/sys/compat/linux/linux_misc.h =================================================================== --- head/sys/compat/linux/linux_misc.h (revision 283382) +++ head/sys/compat/linux/linux_misc.h (revision 283383) @@ -1,127 +1,128 @@ /*- * Copyright (c) 2006 Roman Divacky * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_MISC_H_ #define _LINUX_MISC_H_ /* * Miscellaneous */ #define LINUX_NAME_MAX 255 #define LINUX_MAX_UTSNAME 65 #define LINUX_CTL_MAXNAME 10 /* defines for prctl */ #define LINUX_PR_SET_PDEATHSIG 1 /* Second arg is a signal. */ #define LINUX_PR_GET_PDEATHSIG 2 /* * Second arg is a ptr to return the * signal. */ #define LINUX_PR_GET_KEEPCAPS 7 /* Get drop capabilities on setuid */ #define LINUX_PR_SET_KEEPCAPS 8 /* Set drop capabilities on setuid */ #define LINUX_PR_SET_NAME 15 /* Set process name. */ #define LINUX_PR_GET_NAME 16 /* Get process name. */ #define LINUX_MAX_COMM_LEN 16 /* Maximum length of the process name. */ #define LINUX_MREMAP_MAYMOVE 1 #define LINUX_MREMAP_FIXED 2 #define LINUX_PATH_MAX 4096 extern const char *linux_platform; /* * Non-standard aux entry types used in Linux ELF binaries. 
*/ #define LINUX_AT_PLATFORM 15 /* String identifying CPU */ #define LINUX_AT_HWCAP 16 /* CPU capabilities */ #define LINUX_AT_CLKTCK 17 /* frequency at which times() increments */ #define LINUX_AT_SECURE 23 /* secure mode boolean */ #define LINUX_AT_BASE_PLATFORM 24 /* string identifying real platform, may * differ from AT_PLATFORM. */ #define LINUX_AT_EXECFN 31 /* filename of program */ /* Linux sets the i387 to extended precision. */ #if defined(__i386__) || defined(__amd64__) #define __LINUX_NPXCW__ 0x37f #endif #define LINUX_CLONE_VM 0x00000100 #define LINUX_CLONE_FS 0x00000200 #define LINUX_CLONE_FILES 0x00000400 #define LINUX_CLONE_SIGHAND 0x00000800 #define LINUX_CLONE_PID 0x00001000 /* No longer exist in Linux */ #define LINUX_CLONE_VFORK 0x00004000 #define LINUX_CLONE_PARENT 0x00008000 #define LINUX_CLONE_THREAD 0x00010000 #define LINUX_CLONE_SETTLS 0x00080000 #define LINUX_CLONE_PARENT_SETTID 0x00100000 #define LINUX_CLONE_CHILD_CLEARTID 0x00200000 #define LINUX_CLONE_CHILD_SETTID 0x01000000 #define LINUX_THREADING_FLAGS \ (LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES | \ LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD) /* Scheduling policies */ #define LINUX_SCHED_OTHER 0 #define LINUX_SCHED_FIFO 1 #define LINUX_SCHED_RR 2 struct l_new_utsname { char sysname[LINUX_MAX_UTSNAME]; char nodename[LINUX_MAX_UTSNAME]; char release[LINUX_MAX_UTSNAME]; char version[LINUX_MAX_UTSNAME]; char machine[LINUX_MAX_UTSNAME]; char domainname[LINUX_MAX_UTSNAME]; }; #define LINUX_CLOCK_REALTIME 0 #define LINUX_CLOCK_MONOTONIC 1 #define LINUX_CLOCK_PROCESS_CPUTIME_ID 2 #define LINUX_CLOCK_THREAD_CPUTIME_ID 3 #define LINUX_CLOCK_REALTIME_HR 4 #define LINUX_CLOCK_MONOTONIC_HR 5 extern int stclohz; #define __WCLONE 0x80000000 int linux_common_wait(struct thread *td, int pid, int *status, int options, struct rusage *ru); int linux_set_upcall_kse(struct thread *td, register_t stack); int linux_set_cloned_tls(struct thread *td, void *desc); +struct thread *linux_tdfind(struct thread *, lwpid_t, pid_t); #endif /* _LINUX_MISC_H_ */ Index: head/sys/compat/linux/linux_signal.c =================================================================== --- head/sys/compat/linux/linux_signal.c (revision 283382) +++ head/sys/compat/linux/linux_signal.c (revision 283383) @@ -1,656 +1,676 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include +#include +static int linux_do_tkill(struct thread *td, struct thread *tdt, + ksiginfo_t *ksi); + void linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss) { int b, l; SIGEMPTYSET(*bss); bss->__bits[0] = lss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1); bss->__bits[1] = lss->__bits[1]; for (l = 1; l <= LINUX_SIGTBLSZ; l++) { if (LINUX_SIGISMEMBER(*lss, l)) { b = linux_to_bsd_signal[_SIG_IDX(l)]; if (b) SIGADDSET(*bss, b); } } } void bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss) { int b, l; LINUX_SIGEMPTYSET(*lss); lss->__bits[0] = bss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1); lss->__bits[1] = bss->__bits[1]; for (b = 1; b <= LINUX_SIGTBLSZ; b++) { if (SIGISMEMBER(*bss, b)) { l = bsd_to_linux_signal[_SIG_IDX(b)]; if (l) LINUX_SIGADDSET(*lss, l); } } } static void linux_to_bsd_sigaction(l_sigaction_t *lsa, struct sigaction *bsa) { linux_to_bsd_sigset(&lsa->lsa_mask, &bsa->sa_mask); bsa->sa_handler = PTRIN(lsa->lsa_handler); bsa->sa_flags = 0; if (lsa->lsa_flags & LINUX_SA_NOCLDSTOP) bsa->sa_flags |= SA_NOCLDSTOP; if (lsa->lsa_flags & LINUX_SA_NOCLDWAIT) bsa->sa_flags |= SA_NOCLDWAIT; if (lsa->lsa_flags & LINUX_SA_SIGINFO) bsa->sa_flags |= SA_SIGINFO; if (lsa->lsa_flags & LINUX_SA_ONSTACK) bsa->sa_flags |= SA_ONSTACK; if (lsa->lsa_flags & LINUX_SA_RESTART) bsa->sa_flags |= SA_RESTART; if (lsa->lsa_flags & LINUX_SA_ONESHOT) bsa->sa_flags |= SA_RESETHAND; if (lsa->lsa_flags & LINUX_SA_NOMASK) bsa->sa_flags |= SA_NODEFER; } static void bsd_to_linux_sigaction(struct sigaction *bsa, l_sigaction_t *lsa) { bsd_to_linux_sigset(&bsa->sa_mask, &lsa->lsa_mask); #ifdef COMPAT_LINUX32 lsa->lsa_handler = (uintptr_t)bsa->sa_handler; #else lsa->lsa_handler = bsa->sa_handler; #endif lsa->lsa_restorer = 0; /* unsupported */ lsa->lsa_flags = 0; if (bsa->sa_flags & SA_NOCLDSTOP) lsa->lsa_flags |= LINUX_SA_NOCLDSTOP; if (bsa->sa_flags & SA_NOCLDWAIT) lsa->lsa_flags |= LINUX_SA_NOCLDWAIT; if (bsa->sa_flags & SA_SIGINFO) lsa->lsa_flags |= LINUX_SA_SIGINFO; if (bsa->sa_flags & SA_ONSTACK) lsa->lsa_flags |= LINUX_SA_ONSTACK; if (bsa->sa_flags & SA_RESTART) lsa->lsa_flags |= LINUX_SA_RESTART; if (bsa->sa_flags & SA_RESETHAND) lsa->lsa_flags |= LINUX_SA_ONESHOT; if (bsa->sa_flags & SA_NODEFER) lsa->lsa_flags |= LINUX_SA_NOMASK; } int linux_do_sigaction(struct thread *td, int linux_sig, l_sigaction_t *linux_nsa, l_sigaction_t *linux_osa) { struct sigaction act, oact, *nsa, *osa; int error, sig; if (!LINUX_SIG_VALID(linux_sig)) return (EINVAL); osa = (linux_osa != NULL) ? 
&oact : NULL; if (linux_nsa != NULL) { nsa = &act; linux_to_bsd_sigaction(linux_nsa, nsa); } else nsa = NULL; if (linux_sig <= LINUX_SIGTBLSZ) sig = linux_to_bsd_signal[_SIG_IDX(linux_sig)]; else sig = linux_sig; error = kern_sigaction(td, sig, nsa, osa, 0); if (error) return (error); if (linux_osa != NULL) bsd_to_linux_sigaction(osa, linux_osa); return (0); } int linux_signal(struct thread *td, struct linux_signal_args *args) { l_sigaction_t nsa, osa; int error; #ifdef DEBUG if (ldebug(signal)) printf(ARGS(signal, "%d, %p"), args->sig, (void *)(uintptr_t)args->handler); #endif nsa.lsa_handler = args->handler; nsa.lsa_flags = LINUX_SA_ONESHOT | LINUX_SA_NOMASK; LINUX_SIGEMPTYSET(nsa.lsa_mask); error = linux_do_sigaction(td, args->sig, &nsa, &osa); td->td_retval[0] = (int)(intptr_t)osa.lsa_handler; return (error); } int linux_rt_sigaction(struct thread *td, struct linux_rt_sigaction_args *args) { l_sigaction_t nsa, osa; int error; #ifdef DEBUG if (ldebug(rt_sigaction)) printf(ARGS(rt_sigaction, "%ld, %p, %p, %ld"), (long)args->sig, (void *)args->act, (void *)args->oact, (long)args->sigsetsize); #endif if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); if (args->act != NULL) { error = copyin(args->act, &nsa, sizeof(l_sigaction_t)); if (error) return (error); } error = linux_do_sigaction(td, args->sig, args->act ? &nsa : NULL, args->oact ? &osa : NULL); if (args->oact != NULL && !error) { error = copyout(&osa, args->oact, sizeof(l_sigaction_t)); } return (error); } static int linux_do_sigprocmask(struct thread *td, int how, l_sigset_t *new, l_sigset_t *old) { sigset_t omask, nmask; sigset_t *nmaskp; int error; td->td_retval[0] = 0; switch (how) { case LINUX_SIG_BLOCK: how = SIG_BLOCK; break; case LINUX_SIG_UNBLOCK: how = SIG_UNBLOCK; break; case LINUX_SIG_SETMASK: how = SIG_SETMASK; break; default: return (EINVAL); } if (new != NULL) { linux_to_bsd_sigset(new, &nmask); nmaskp = &nmask; } else nmaskp = NULL; error = kern_sigprocmask(td, how, nmaskp, &omask, 0); if (error == 0 && old != NULL) bsd_to_linux_sigset(&omask, old); return (error); } int linux_sigprocmask(struct thread *td, struct linux_sigprocmask_args *args) { l_osigset_t mask; l_sigset_t set, oset; int error; #ifdef DEBUG if (ldebug(sigprocmask)) printf(ARGS(sigprocmask, "%d, *, *"), args->how); #endif if (args->mask != NULL) { error = copyin(args->mask, &mask, sizeof(l_osigset_t)); if (error) return (error); LINUX_SIGEMPTYSET(set); set.__bits[0] = mask; } error = linux_do_sigprocmask(td, args->how, args->mask ? &set : NULL, args->omask ? &oset : NULL); if (args->omask != NULL && !error) { mask = oset.__bits[0]; error = copyout(&mask, args->omask, sizeof(l_osigset_t)); } return (error); } int linux_rt_sigprocmask(struct thread *td, struct linux_rt_sigprocmask_args *args) { l_sigset_t set, oset; int error; #ifdef DEBUG if (ldebug(rt_sigprocmask)) printf(ARGS(rt_sigprocmask, "%d, %p, %p, %ld"), args->how, (void *)args->mask, (void *)args->omask, (long)args->sigsetsize); #endif if (args->sigsetsize != sizeof(l_sigset_t)) return EINVAL; if (args->mask != NULL) { error = copyin(args->mask, &set, sizeof(l_sigset_t)); if (error) return (error); } error = linux_do_sigprocmask(td, args->how, args->mask ? &set : NULL, args->omask ? 
&oset : NULL); if (args->omask != NULL && !error) { error = copyout(&oset, args->omask, sizeof(l_sigset_t)); } return (error); } int linux_sgetmask(struct thread *td, struct linux_sgetmask_args *args) { struct proc *p = td->td_proc; l_sigset_t mask; #ifdef DEBUG if (ldebug(sgetmask)) printf(ARGS(sgetmask, "")); #endif PROC_LOCK(p); bsd_to_linux_sigset(&td->td_sigmask, &mask); PROC_UNLOCK(p); td->td_retval[0] = mask.__bits[0]; return (0); } int linux_ssetmask(struct thread *td, struct linux_ssetmask_args *args) { struct proc *p = td->td_proc; l_sigset_t lset; sigset_t bset; #ifdef DEBUG if (ldebug(ssetmask)) printf(ARGS(ssetmask, "%08lx"), (unsigned long)args->mask); #endif PROC_LOCK(p); bsd_to_linux_sigset(&td->td_sigmask, &lset); td->td_retval[0] = lset.__bits[0]; LINUX_SIGEMPTYSET(lset); lset.__bits[0] = args->mask; linux_to_bsd_sigset(&lset, &bset); td->td_sigmask = bset; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (0); } /* * MPSAFE */ int linux_sigpending(struct thread *td, struct linux_sigpending_args *args) { struct proc *p = td->td_proc; sigset_t bset; l_sigset_t lset; l_osigset_t mask; #ifdef DEBUG if (ldebug(sigpending)) printf(ARGS(sigpending, "*")); #endif PROC_LOCK(p); bset = p->p_siglist; SIGSETOR(bset, td->td_siglist); SIGSETAND(bset, td->td_sigmask); PROC_UNLOCK(p); bsd_to_linux_sigset(&bset, &lset); mask = lset.__bits[0]; return (copyout(&mask, args->mask, sizeof(mask))); } /* * MPSAFE */ int linux_rt_sigpending(struct thread *td, struct linux_rt_sigpending_args *args) { struct proc *p = td->td_proc; sigset_t bset; l_sigset_t lset; if (args->sigsetsize > sizeof(lset)) return EINVAL; /* NOT REACHED */ #ifdef DEBUG if (ldebug(rt_sigpending)) printf(ARGS(rt_sigpending, "*")); #endif PROC_LOCK(p); bset = p->p_siglist; SIGSETOR(bset, td->td_siglist); SIGSETAND(bset, td->td_sigmask); PROC_UNLOCK(p); bsd_to_linux_sigset(&bset, &lset); return (copyout(&lset, args->set, args->sigsetsize)); } /* * MPSAFE */ int linux_rt_sigtimedwait(struct thread *td, struct linux_rt_sigtimedwait_args *args) { int error, sig; l_timeval ltv; struct timeval tv; struct timespec ts, *tsa; l_sigset_t lset; sigset_t bset; l_siginfo_t linfo; ksiginfo_t info; #ifdef DEBUG if (ldebug(rt_sigtimedwait)) printf(ARGS(rt_sigtimedwait, "*")); #endif if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); if ((error = copyin(args->mask, &lset, sizeof(lset)))) return (error); linux_to_bsd_sigset(&lset, &bset); tsa = NULL; if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) return (error); #ifdef DEBUG if (ldebug(rt_sigtimedwait)) printf(LMSG("linux_rt_sigtimedwait: " "incoming timeout (%d/%d)\n"), ltv.tv_sec, ltv.tv_usec); #endif tv.tv_sec = (long)ltv.tv_sec; tv.tv_usec = (suseconds_t)ltv.tv_usec; if (itimerfix(&tv)) { /* * The timeout was invalid. Convert it to something * valid that will act as it does under Linux. 
*/ tv.tv_sec += tv.tv_usec / 1000000; tv.tv_usec %= 1000000; if (tv.tv_usec < 0) { tv.tv_sec -= 1; tv.tv_usec += 1000000; } if (tv.tv_sec < 0) timevalclear(&tv); #ifdef DEBUG if (ldebug(rt_sigtimedwait)) printf(LMSG("linux_rt_sigtimedwait: " "converted timeout (%jd/%ld)\n"), (intmax_t)tv.tv_sec, tv.tv_usec); #endif } TIMEVAL_TO_TIMESPEC(&tv, &ts); tsa = &ts; } error = kern_sigtimedwait(td, bset, &info, tsa); #ifdef DEBUG if (ldebug(rt_sigtimedwait)) printf(LMSG("linux_rt_sigtimedwait: " "sigtimedwait returning (%d)\n"), error); #endif if (error) return (error); sig = BSD_TO_LINUX_SIGNAL(info.ksi_signo); if (args->ptr) { memset(&linfo, 0, sizeof(linfo)); ksiginfo_to_lsiginfo(&info, &linfo, sig); error = copyout(&linfo, args->ptr, sizeof(linfo)); } if (error == 0) td->td_retval[0] = sig; return (error); } int linux_kill(struct thread *td, struct linux_kill_args *args) { struct kill_args /* { int pid; int signum; } */ tmp; #ifdef DEBUG if (ldebug(kill)) printf(ARGS(kill, "%d, %d"), args->pid, args->signum); #endif /* * Allow signal 0 as a means to check for privileges */ if (!LINUX_SIG_VALID(args->signum) && args->signum != 0) return (EINVAL); if (args->signum > 0 && args->signum <= LINUX_SIGTBLSZ) tmp.signum = linux_to_bsd_signal[_SIG_IDX(args->signum)]; else tmp.signum = args->signum; tmp.pid = args->pid; return (sys_kill(td, &tmp)); } static int -linux_do_tkill(struct thread *td, l_int tgid, l_int pid, l_int signum) +linux_do_tkill(struct thread *td, struct thread *tdt, ksiginfo_t *ksi) { - struct proc *proc = td->td_proc; - struct linux_emuldata *em; struct proc *p; - ksiginfo_t ksi; int error; - AUDIT_ARG_SIGNUM(signum); - AUDIT_ARG_PID(pid); - - /* - * Allow signal 0 as a means to check for privileges - */ - if (!LINUX_SIG_VALID(signum) && signum != 0) - return (EINVAL); - - if (signum > 0 && signum <= LINUX_SIGTBLSZ) - signum = linux_to_bsd_signal[_SIG_IDX(signum)]; - - if ((p = pfind(pid)) == NULL) { - if ((p = zpfind(pid)) == NULL) - return (ESRCH); - } - + p = tdt->td_proc; + AUDIT_ARG_SIGNUM(ksi->ksi_signo); + AUDIT_ARG_PID(p->p_pid); AUDIT_ARG_PROCESS(p); - error = p_cansignal(td, p, signum); - if (error != 0 || signum == 0) - goto out; - error = ESRCH; - em = em_find(p, EMUL_DONTLOCK); - - if (em == NULL) { -#ifdef DEBUG - printf("emuldata not found in do_tkill.\n"); -#endif + error = p_cansignal(td, p, ksi->ksi_signo); + if (error != 0 || ksi->ksi_signo == 0) goto out; - } - if (tgid > 0 && em->shared->group_pid != tgid) - goto out; - ksiginfo_init(&ksi); - ksi.ksi_signo = signum; - ksi.ksi_code = LINUX_SI_TKILL; - ksi.ksi_errno = 0; - ksi.ksi_pid = proc->p_pid; - ksi.ksi_uid = proc->p_ucred->cr_ruid; + tdksignal(tdt, ksi->ksi_signo, ksi); - error = pksignal(p, ksi.ksi_signo, &ksi); - out: PROC_UNLOCK(p); return (error); } int linux_tgkill(struct thread *td, struct linux_tgkill_args *args) { + struct thread *tdt; + ksiginfo_t ksi; + int sig; #ifdef DEBUG if (ldebug(tgkill)) - printf(ARGS(tgkill, "%d, %d, %d"), args->tgid, args->pid, args->sig); + printf(ARGS(tgkill, "%d, %d, %d"), + args->tgid, args->pid, args->sig); #endif + if (args->pid <= 0 || args->tgid <=0) return (EINVAL); - return (linux_do_tkill(td, args->tgid, args->pid, args->sig)); + /* + * Allow signal 0 as a means to check for privileges + */ + if (!LINUX_SIG_VALID(args->sig) && args->sig != 0) + return (EINVAL); + + if (args->sig > 0 && args->sig <= LINUX_SIGTBLSZ) + sig = linux_to_bsd_signal[_SIG_IDX(args->sig)]; + else + sig = args->sig; + + tdt = linux_tdfind(td, args->pid, args->tgid); + if (tdt == NULL) + 
return (ESRCH); + + ksiginfo_init(&ksi); + ksi.ksi_signo = sig; + ksi.ksi_code = LINUX_SI_TKILL; + ksi.ksi_errno = 0; + ksi.ksi_pid = td->td_proc->p_pid; + ksi.ksi_uid = td->td_proc->p_ucred->cr_ruid; + return (linux_do_tkill(td, tdt, &ksi)); } +/* + * Deprecated since 2.5.75. Replaced by tgkill(). + */ int linux_tkill(struct thread *td, struct linux_tkill_args *args) { + struct thread *tdt; + ksiginfo_t ksi; + int sig; + #ifdef DEBUG if (ldebug(tkill)) printf(ARGS(tkill, "%i, %i"), args->tid, args->sig); #endif if (args->tid <= 0) return (EINVAL); - return (linux_do_tkill(td, 0, args->tid, args->sig)); + if (!LINUX_SIG_VALID(args->sig)) + return (EINVAL); + + if (args->sig > 0 && args->sig <= LINUX_SIGTBLSZ) + sig = linux_to_bsd_signal[_SIG_IDX(args->sig)]; + else + sig = args->sig; + + tdt = linux_tdfind(td, args->tid, -1); + if (tdt == NULL) + return (ESRCH); + + ksiginfo_init(&ksi); + ksi.ksi_signo = sig; + ksi.ksi_code = LINUX_SI_TKILL; + ksi.ksi_errno = 0; + ksi.ksi_pid = td->td_proc->p_pid; + ksi.ksi_uid = td->td_proc->p_ucred->cr_ruid; + return (linux_do_tkill(td, tdt, &ksi)); } void ksiginfo_to_lsiginfo(ksiginfo_t *ksi, l_siginfo_t *lsi, l_int sig) { lsi->lsi_signo = sig; lsi->lsi_code = ksi->ksi_code; switch (sig) { case LINUX_SIGPOLL: /* XXX si_fd? */ lsi->lsi_band = ksi->ksi_band; break; case LINUX_SIGCHLD: lsi->lsi_pid = ksi->ksi_pid; lsi->lsi_uid = ksi->ksi_uid; lsi->lsi_status = ksi->ksi_status; break; case LINUX_SIGBUS: case LINUX_SIGILL: case LINUX_SIGFPE: case LINUX_SIGSEGV: lsi->lsi_addr = PTROUT(ksi->ksi_addr); break; default: /* XXX SI_TIMER etc... */ lsi->lsi_pid = ksi->ksi_pid; lsi->lsi_uid = ksi->ksi_uid; break; } if (sig >= LINUX_SIGRTMIN) { lsi->lsi_int = ksi->ksi_info.si_value.sival_int; lsi->lsi_ptr = PTROUT(ksi->ksi_info.si_value.sival_ptr); } } Index: head/sys/compat/linux/stats_timing.d =================================================================== --- head/sys/compat/linux/stats_timing.d (revision 283382) +++ head/sys/compat/linux/stats_timing.d (revision 283383) @@ -1,94 +1,93 @@ #!/usr/sbin/dtrace -qs /*- * Copyright (c) 2008-2012 Alexander Leidinger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /** * Some statistics (all per provider): * - number of calls to a function per executable binary (not per PID!) 
* - allows to see where an optimization would be beneficial for a given * application * - graph of CPU time spend in functions per executable binary * - together with the number of calls to this function this allows * to determine if a kernel optimization would be beneficial / is * possible for a given application * - graph of longest running (CPU-time!) function in total * - may help finding problem cases in the kernel code - * - timing statistics for the emul_lock * - graph of longest held (CPU-time!) locks */ #pragma D option dynvarsize=32m linuxulator*:::entry { self->time[probefunc] = vtimestamp; @calls[probeprov, execname, probefunc] = count(); } linuxulator*:::return /self->time[probefunc] != 0/ { this->timediff = self->time[probefunc] - vtimestamp; @stats[probeprov, execname, probefunc] = quantize(this->timediff); @longest[probeprov, probefunc] = max(this->timediff); self->time[probefunc] = 0; } linuxulator*:::locked { self->lock[arg0] = vtimestamp; } linuxulator*:::unlock /self->lock[arg0] != 0/ { this->timediff = self->lock[arg0] - vtimestamp; @lockstats[probefunc] = quantize(this->timediff); @longlock[probefunc] = max(this->timediff); self->lock[arg0] = 0; } END { printf("Number of calls per provider/application/kernel function:"); printa(@calls); printf("CPU-timing statistics per provider/application/kernel function (in ns):"); printa(@stats); printf("Longest running (CPU-time!) functions per provider (in ns):"); printa(@longest); printf("Lock CPU-timing statistics:"); printa(@lockstats); printf("Longest running (CPU-time!) locks:"); printa(@longlock); } Index: head/sys/i386/linux/linux_machdep.c =================================================================== --- head/sys/i386/linux/linux_machdep.c (revision 283382) +++ head/sys/i386/linux/linux_machdep.c (revision 283383) @@ -1,1081 +1,1080 @@ /*- * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needed for pcb definition in linux_set_thread_area */ #include "opt_posix.h" extern struct sysentvec elf32_freebsd_sysvec; /* defined in i386/i386/elf_machdep.c */ struct l_descriptor { l_uint entry_number; l_ulong base_addr; l_uint limit; l_uint seg_32bit:1; l_uint contents:2; l_uint read_exec_only:1; l_uint limit_in_pages:1; l_uint seg_not_present:1; l_uint useable:1; }; struct l_old_select_argv { l_int nfds; l_fd_set *readfds; l_fd_set *writefds; l_fd_set *exceptfds; struct l_timeval *timeout; }; static int linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, l_int flags, l_int fd, l_loff_t pos); int linux_to_bsd_sigaltstack(int lsa) { int bsa = 0; if (lsa & LINUX_SS_DISABLE) bsa |= SS_DISABLE; if (lsa & LINUX_SS_ONSTACK) bsa |= SS_ONSTACK; return (bsa); } int bsd_to_linux_sigaltstack(int bsa) { int lsa = 0; if (bsa & SS_DISABLE) lsa |= LINUX_SS_DISABLE; if (bsa & SS_ONSTACK) lsa |= LINUX_SS_ONSTACK; return (lsa); } int linux_execve(struct thread *td, struct linux_execve_args *args) { struct image_args eargs; struct vmspace *oldvmspace; char *newpath; int error; LCONVPATHEXIST(td, args->path, &newpath); #ifdef DEBUG if (ldebug(execve)) printf(ARGS(execve, "%s"), newpath); #endif error = pre_execve(td, &oldvmspace); if (error != 0) { free(newpath, M_TEMP); return (error); } error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE, args->argp, args->envp); free(newpath, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); - if (error == 0) { - /* linux process can exec fbsd one, dont attempt - * to create emuldata for such process using - * linux_proc_init, this leads to a panic on KASSERT - * because such process has p->p_emuldata == NULL - */ - if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX) - error = linux_proc_init(td, 0, 0); - } + if (error == 0) + error = linux_common_execve(td, &eargs); post_execve(td, error, oldvmspace); return (error); } struct l_ipc_kludge { struct l_msgbuf *msgp; l_long msgtyp; }; int linux_ipc(struct thread *td, struct linux_ipc_args *args) { switch (args->what & 0xFFFF) { case LINUX_SEMOP: { struct linux_semop_args a; a.semid = args->arg1; a.tsops = args->ptr; a.nsops = args->arg2; return (linux_semop(td, &a)); } case LINUX_SEMGET: { struct linux_semget_args a; a.key = args->arg1; a.nsems = args->arg2; a.semflg = args->arg3; return (linux_semget(td, &a)); } case LINUX_SEMCTL: { struct linux_semctl_args a; int error; a.semid = args->arg1; a.semnum = args->arg2; a.cmd = args->arg3; error = copyin(args->ptr, &a.arg, sizeof(a.arg)); if (error) return (error); return (linux_semctl(td, &a)); } case LINUX_MSGSND: { struct linux_msgsnd_args a; a.msqid = args->arg1; a.msgp = args->ptr; a.msgsz = args->arg2; a.msgflg = args->arg3; return (linux_msgsnd(td, &a)); } case LINUX_MSGRCV: { struct linux_msgrcv_args a; a.msqid = args->arg1; a.msgsz = args->arg2; a.msgflg = args->arg3; if ((args->what >> 16) == 0) { struct l_ipc_kludge tmp; int error; if (args->ptr == NULL) return (EINVAL); error = copyin(args->ptr, &tmp, sizeof(tmp)); if (error) return (error); a.msgp = tmp.msgp; a.msgtyp = tmp.msgtyp; } else { a.msgp = args->ptr; a.msgtyp = args->arg5; } return 
(linux_msgrcv(td, &a)); } case LINUX_MSGGET: { struct linux_msgget_args a; a.key = args->arg1; a.msgflg = args->arg2; return (linux_msgget(td, &a)); } case LINUX_MSGCTL: { struct linux_msgctl_args a; a.msqid = args->arg1; a.cmd = args->arg2; a.buf = args->ptr; return (linux_msgctl(td, &a)); } case LINUX_SHMAT: { struct linux_shmat_args a; a.shmid = args->arg1; a.shmaddr = args->ptr; a.shmflg = args->arg2; a.raddr = (l_ulong *)args->arg3; return (linux_shmat(td, &a)); } case LINUX_SHMDT: { struct linux_shmdt_args a; a.shmaddr = args->ptr; return (linux_shmdt(td, &a)); } case LINUX_SHMGET: { struct linux_shmget_args a; a.key = args->arg1; a.size = args->arg2; a.shmflg = args->arg3; return (linux_shmget(td, &a)); } case LINUX_SHMCTL: { struct linux_shmctl_args a; a.shmid = args->arg1; a.cmd = args->arg2; a.buf = args->ptr; return (linux_shmctl(td, &a)); } default: break; } return (EINVAL); } int linux_old_select(struct thread *td, struct linux_old_select_args *args) { struct l_old_select_argv linux_args; struct linux_select_args newsel; int error; #ifdef DEBUG if (ldebug(old_select)) printf(ARGS(old_select, "%p"), args->ptr); #endif error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); newsel.nfds = linux_args.nfds; newsel.readfds = linux_args.readfds; newsel.writefds = linux_args.writefds; newsel.exceptfds = linux_args.exceptfds; newsel.timeout = linux_args.timeout; return (linux_select(td, &newsel)); } int linux_set_cloned_tls(struct thread *td, void *desc) { struct segment_descriptor sd; struct l_user_desc info; int idx, error; int a[2]; error = copyin(desc, &info, sizeof(struct l_user_desc)); if (error) { printf(LMSG("copyin failed!")); } else { idx = info.entry_number; /* * looks like we're getting the idx we returned * in the set_thread_area() syscall */ if (idx != 6 && idx != 3) { printf(LMSG("resetting idx!")); idx = 3; } /* this doesnt happen in practice */ if (idx == 6) { /* we might copy out the entry_number as 3 */ info.entry_number = 3; error = copyout(&info, desc, sizeof(struct l_user_desc)); if (error) printf(LMSG("copyout failed!")); } a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); memcpy(&sd, &a, sizeof(a)); #ifdef DEBUG if (ldebug(clone)) printf("Segment created in clone with " "CLONE_SETTLS: lobase: %x, hibase: %x, " "lolimit: %x, hilimit: %x, type: %i, " "dpl: %i, p: %i, xx: %i, def32: %i, " "gran: %i\n", sd.sd_lobase, sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, sd.sd_def32, sd.sd_gran); #endif /* set %gs */ td->td_pcb->pcb_gsd = sd; td->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL); } return (error); } int linux_set_upcall_kse(struct thread *td, register_t stack) { - td->td_frame->tf_esp = stack; + if (stack) + td->td_frame->tf_esp = stack; + /* + * The newly created Linux thread returns + * to the user space by the same path that a parent do. 
+ */ + td->td_frame->tf_eax = 0; return (0); } #define STACK_SIZE (2 * 1024 * 1024) #define GUARD_SIZE (4 * PAGE_SIZE) int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { #ifdef DEBUG if (ldebug(mmap2)) printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"), (void *)args->addr, args->len, args->prot, args->flags, args->fd, args->pgoff); #endif return (linux_mmap_common(td, args->addr, args->len, args->prot, args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * PAGE_SIZE)); } int linux_mmap(struct thread *td, struct linux_mmap_args *args) { int error; struct l_mmap_argv linux_args; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); #ifdef DEBUG if (ldebug(mmap)) printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"), (void *)linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, linux_args.pgoff); #endif return (linux_mmap_common(td, linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, (uint32_t)linux_args.pgoff)); } static int linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot, l_int flags, l_int fd, l_loff_t pos) { struct proc *p = td->td_proc; struct mmap_args /* { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; } */ bsd_args; int error; struct file *fp; cap_rights_t rights; error = 0; bsd_args.flags = 0; fp = NULL; /* * Linux mmap(2): * You must specify exactly one of MAP_SHARED and MAP_PRIVATE */ if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) return (EINVAL); if (flags & LINUX_MAP_SHARED) bsd_args.flags |= MAP_SHARED; if (flags & LINUX_MAP_PRIVATE) bsd_args.flags |= MAP_PRIVATE; if (flags & LINUX_MAP_FIXED) bsd_args.flags |= MAP_FIXED; if (flags & LINUX_MAP_ANON) { /* Enforce pos to be on page boundary, then ignore. */ if ((pos & PAGE_MASK) != 0) return (EINVAL); pos = 0; bsd_args.flags |= MAP_ANON; } else bsd_args.flags |= MAP_NOSYNC; if (flags & LINUX_MAP_GROWSDOWN) bsd_args.flags |= MAP_STACK; /* * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC * on Linux/i386. We do this to ensure maximum compatibility. * Linux/ia64 does the same in i386 emulation mode. */ bsd_args.prot = prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd; if (bsd_args.fd != -1) { /* * Linux follows Solaris mmap(2) description: * The file descriptor fildes is opened with * read permission, regardless of the * protection options specified. * * Checking just CAP_MMAP is fine here, since the real work * is done in the FreeBSD mmap(). */ error = fget(td, bsd_args.fd, cap_rights_init(&rights, CAP_MMAP), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EINVAL); } /* Linux mmap() just fails for O_WRONLY files */ if (!(fp->f_flag & FREAD)) { fdrop(fp, td); return (EACCES); } fdrop(fp, td); } if (flags & LINUX_MAP_GROWSDOWN) { /* * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option * takes as addr the inital BOS, and as len, the initial * region size. It can then grow down from addr without * limit. However, linux threads has an implicit internal * limit to stack size of STACK_SIZE. Its just not * enforced explicitly in linux. But, here we impose * a limit of (STACK_SIZE - GUARD_SIZE) on the stack * region, since we can do this with our mmap. 
* * Our mmap with MAP_STACK takes addr as the maximum * downsize limit on BOS, and as len the max size of * the region. It them maps the top SGROWSIZ bytes, * and auto grows the region down, up to the limit * in addr. * * If we don't use the MAP_STACK option, the effect * of this code is to allocate a stack region of a * fixed size of (STACK_SIZE - GUARD_SIZE). */ if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) { /* * Some linux apps will attempt to mmap * thread stacks near the top of their * address space. If their TOS is greater * than vm_maxsaddr, vm_map_growstack() * will confuse the thread stack with the * process stack and deliver a SEGV if they * attempt to grow the thread stack past their * current stacksize rlimit. To avoid this, * adjust vm_maxsaddr upwards to reflect * the current stacksize rlimit rather * than the maximum possible stacksize. * It would be better to adjust the * mmap'ed region, but some apps do not check * mmap's return value. */ PROC_LOCK(p); p->p_vmspace->vm_maxsaddr = (char *)USRSTACK - lim_cur(p, RLIMIT_STACK); PROC_UNLOCK(p); } /* * This gives us our maximum stack size and a new BOS. * If we're using VM_STACK, then mmap will just map * the top SGROWSIZ bytes, and let the stack grow down * to the limit at BOS. If we're not using VM_STACK * we map the full stack, since we don't have a way * to autogrow it. */ if (len > STACK_SIZE - GUARD_SIZE) { bsd_args.addr = (caddr_t)PTRIN(addr); bsd_args.len = len; } else { bsd_args.addr = (caddr_t)PTRIN(addr) - (STACK_SIZE - GUARD_SIZE - len); bsd_args.len = STACK_SIZE - GUARD_SIZE; } } else { bsd_args.addr = (caddr_t)PTRIN(addr); bsd_args.len = len; } bsd_args.pos = pos; #ifdef DEBUG if (ldebug(mmap)) printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n", __func__, (void *)bsd_args.addr, bsd_args.len, bsd_args.prot, bsd_args.flags, bsd_args.fd, (int)bsd_args.pos); #endif error = sys_mmap(td, &bsd_args); #ifdef DEBUG if (ldebug(mmap)) printf("-> %s() return: 0x%x (0x%08x)\n", __func__, error, (u_int)td->td_retval[0]); #endif return (error); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { struct mprotect_args bsd_args; bsd_args.addr = uap->addr; bsd_args.len = uap->len; bsd_args.prot = uap->prot; if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) bsd_args.prot |= PROT_READ | PROT_EXEC; return (sys_mprotect(td, &bsd_args)); } int linux_ioperm(struct thread *td, struct linux_ioperm_args *args) { int error; struct i386_ioperm_args iia; iia.start = args->start; iia.length = args->length; iia.enable = args->enable; error = i386_set_ioperm(td, &iia); return (error); } int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; if (args->level < 0 || args->level > 3) return (EINVAL); if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) | (args->level * (PSL_IOPL / 3)); return (0); } int linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap) { int error; struct i386_ldt_args ldt; struct l_descriptor ld; union descriptor desc; int size, written; switch (uap->func) { case 0x00: /* read_ldt */ ldt.start = 0; ldt.descs = uap->ptr; ldt.num = uap->bytecount / sizeof(union descriptor); error = i386_get_ldt(td, &ldt); td->td_retval[0] *= sizeof(union descriptor); break; case 0x02: /* read_default_ldt = 0 */ size = 5*sizeof(struct l_desc_struct); if (size > uap->bytecount) size = uap->bytecount; for (written = error = 0; written < 
size && error == 0; written++) error = subyte((char *)uap->ptr + written, 0); td->td_retval[0] = written; break; case 0x01: /* write_ldt */ case 0x11: /* write_ldt */ if (uap->bytecount != sizeof(ld)) return (EINVAL); error = copyin(uap->ptr, &ld, sizeof(ld)); if (error) return (error); ldt.start = ld.entry_number; ldt.descs = &desc; ldt.num = 1; desc.sd.sd_lolimit = (ld.limit & 0x0000ffff); desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16; desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff); desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24; desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) | (ld.contents << 2); desc.sd.sd_dpl = 3; desc.sd.sd_p = (ld.seg_not_present ^ 1); desc.sd.sd_xx = 0; desc.sd.sd_def32 = ld.seg_32bit; desc.sd.sd_gran = ld.limit_in_pages; error = i386_set_ldt(td, &ldt, &desc); break; default: error = ENOSYS; break; } if (error == EOPNOTSUPP) { printf("linux: modify_ldt needs kernel option USER_LDT\n"); error = ENOSYS; } return (error); } int linux_sigaction(struct thread *td, struct linux_sigaction_args *args) { l_osigaction_t osa; l_sigaction_t act, oact; int error; #ifdef DEBUG if (ldebug(sigaction)) printf(ARGS(sigaction, "%d, %p, %p"), args->sig, (void *)args->nsa, (void *)args->osa); #endif if (args->nsa != NULL) { error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); if (error) return (error); act.lsa_handler = osa.lsa_handler; act.lsa_flags = osa.lsa_flags; act.lsa_restorer = osa.lsa_restorer; LINUX_SIGEMPTYSET(act.lsa_mask); act.lsa_mask.__bits[0] = osa.lsa_mask; } error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, args->osa ? &oact : NULL); if (args->osa != NULL && !error) { osa.lsa_handler = oact.lsa_handler; osa.lsa_flags = oact.lsa_flags; osa.lsa_restorer = oact.lsa_restorer; osa.lsa_mask = oact.lsa_mask.__bits[0]; error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); } return (error); } /* * Linux has two extra args, restart and oldmask. We dont use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. 
*/ int linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) { sigset_t sigmask; l_sigset_t mask; #ifdef DEBUG if (ldebug(sigsuspend)) printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask); #endif LINUX_SIGEMPTYSET(mask); mask.__bits[0] = args->mask; linux_to_bsd_sigset(&mask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) { l_sigset_t lmask; sigset_t sigmask; int error; #ifdef DEBUG if (ldebug(rt_sigsuspend)) printf(ARGS(rt_sigsuspend, "%p, %d"), (void *)uap->newset, uap->sigsetsize); #endif if (uap->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); if (error) return (error); linux_to_bsd_sigset(&lmask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_pause(struct thread *td, struct linux_pause_args *args) { struct proc *p = td->td_proc; sigset_t sigmask; #ifdef DEBUG if (ldebug(pause)) printf(ARGS(pause, "")); #endif PROC_LOCK(p); sigmask = td->td_sigmask; PROC_UNLOCK(p); return (kern_sigsuspend(td, sigmask)); } int linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) { stack_t ss, oss; l_stack_t lss; int error; #ifdef DEBUG if (ldebug(sigaltstack)) printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss); #endif if (uap->uss != NULL) { error = copyin(uap->uss, &lss, sizeof(l_stack_t)); if (error) return (error); ss.ss_sp = lss.ss_sp; ss.ss_size = lss.ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); } error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, (uap->uoss != NULL) ? &oss : NULL); if (!error && uap->uoss != NULL) { lss.ss_sp = oss.ss_sp; lss.ss_size = oss.ss_size; lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); } return (error); } int linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) { struct ftruncate_args sa; #ifdef DEBUG if (ldebug(ftruncate64)) printf(ARGS(ftruncate64, "%u, %jd"), args->fd, (intmax_t)args->length); #endif sa.fd = args->fd; sa.length = args->length; return sys_ftruncate(td, &sa); } int linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args) { struct l_user_desc info; int error; int idx; int a[2]; struct segment_descriptor sd; error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); #ifdef DEBUG if (ldebug(set_thread_area)) printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"), info.entry_number, info.base_addr, info.limit, info.seg_32bit, info.contents, info.read_exec_only, info.limit_in_pages, info.seg_not_present, info.useable); #endif idx = info.entry_number; /* * Semantics of linux version: every thread in the system has array of * 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This * syscall loads one of the selected tls decriptors with a value and * also loads GDT descriptors 6, 7 and 8 with the content of the * per-thread descriptors. * * Semantics of fbsd version: I think we can ignore that linux has 3 * per-thread descriptors and use just the 1st one. The tls_array[] * is used only in set/get-thread_area() syscalls and for loading the * GDT descriptors. In fbsd we use just one GDT descriptor for TLS so * we will load just one. * * XXX: this doesn't work when a user space process tries to use more * than 1 TLS segment. Comment in the linux sources says wine might do * this. 
*/ /* * we support just GLIBC TLS now * we should let 3 proceed as well because we use this segment so * if code does two subsequent calls it should succeed */ if (idx != 6 && idx != -1 && idx != 3) return (EINVAL); /* * we have to copy out the GDT entry we use * FreeBSD uses GDT entry #3 for storing %gs so load that * * XXX: what if a user space program doesn't check this value and tries * to use 6, 7 or 8? */ idx = info.entry_number = 3; error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (error); if (LINUX_LDT_empty(&info)) { a[0] = 0; a[1] = 0; } else { a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); } memcpy(&sd, &a, sizeof(a)); #ifdef DEBUG if (ldebug(set_thread_area)) printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase, sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit, sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx, sd.sd_def32, sd.sd_gran); #endif /* this is taken from i386 version of cpu_set_user_tls() */ critical_enter(); /* set %gs */ td->td_pcb->pcb_gsd = sd; PCPU_GET(fsgs_gdt)[1] = sd; load_gs(GSEL(GUGS_SEL, SEL_UPL)); critical_exit(); return (0); } int linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args) { struct l_user_desc info; int error; int idx; struct l_desc_struct desc; struct segment_descriptor sd; #ifdef DEBUG if (ldebug(get_thread_area)) printf(ARGS(get_thread_area, "%p"), args->desc); #endif error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); idx = info.entry_number; /* XXX: I am not sure if we want 3 to be allowed too. */ if (idx != 6 && idx != 3) return (EINVAL); idx = 3; memset(&info, 0, sizeof(info)); sd = PCPU_GET(fsgs_gdt)[1]; memcpy(&desc, &sd, sizeof(desc)); info.entry_number = idx; info.base_addr = LINUX_GET_BASE(&desc); info.limit = LINUX_GET_LIMIT(&desc); info.seg_32bit = LINUX_GET_32BIT(&desc); info.contents = LINUX_GET_CONTENTS(&desc); info.read_exec_only = !LINUX_GET_WRITABLE(&desc); info.limit_in_pages = LINUX_GET_LIMIT_PAGES(&desc); info.seg_not_present = !LINUX_GET_PRESENT(&desc); info.useable = LINUX_GET_USEABLE(&desc); error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (EFAULT); return (0); } /* XXX: this wont work with module - convert it */ int linux_mq_open(struct thread *td, struct linux_mq_open_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_open(td, (struct kmq_open_args *) args); #else return (ENOSYS); #endif } int linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_unlink(td, (struct kmq_unlink_args *) args); #else return (ENOSYS); #endif } int linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_timedsend(td, (struct kmq_timedsend_args *) args); #else return (ENOSYS); #endif } int linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_timedreceive(td, (struct kmq_timedreceive_args *) args); #else return (ENOSYS); #endif } int linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_notify(td, (struct kmq_notify_args *) args); #else return (ENOSYS); #endif } int linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args) { #ifdef P1003_1B_MQUEUE return sys_kmq_setattr(td, (struct kmq_setattr_args *) args); #else return (ENOSYS); #endif } 
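/*
 * Editorial sketch (not part of this revision): what the set_thread_area()
 * emulation above has to service from user space.  A Linux/i386 program,
 * typically glibc's TLS setup, passes entry_number = -1 to ask the kernel
 * to pick a GDT slot and then loads %gs with the selector for whatever
 * slot comes back.  The header name, the -1 convention and the
 * "entry * 8 + 3" selector arithmetic follow the Linux i386 ABI; the
 * function itself is a hypothetical illustration, not code from this tree.
 */
#include <asm/ldt.h>		/* struct user_desc (Linux ABI header) */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int
install_tls_segment(void *base, unsigned int limit)
{
	struct user_desc d;
	unsigned short sel;

	memset(&d, 0, sizeof(d));
	d.entry_number = -1;		/* let the kernel choose a TLS slot */
	d.base_addr = (unsigned long)base;
	d.limit = limit;
	d.seg_32bit = 1;
	d.useable = 1;

	if (syscall(SYS_set_thread_area, &d) != 0)
		return (-1);

	/* GDT selector for the slot the kernel reported back, RPL 3. */
	sel = d.entry_number * 8 + 3;
	__asm__ __volatile__("movw %w0, %%gs" : : "q" (sel));
	return (0);
}
/*
 * Note that the emulation above rewrites entry_number to 3 before copying
 * the descriptor back out, so the selector such a program computes always
 * refers to that fixed slot.
 */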
int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options; struct rusage ru, *rup; #ifdef DEBUG if (ldebug(wait4)) printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; if (args->rusage != NULL) rup = &ru; else rup = NULL; error = linux_common_wait(td, args->pid, args->status, options, rup); if (error) return (error); if (args->rusage != NULL) error = copyout(&ru, args->rusage, sizeof(ru)); return (error); } Index: head/sys/i386/linux/linux_sysvec.c =================================================================== --- head/sys/i386/linux/linux_sysvec.c (revision 283382) +++ head/sys/i386/linux/linux_sysvec.c (revision 283383) @@ -1,1159 +1,1161 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). 
*/ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) extern char linux_sigcode[]; extern int linux_szsigcode; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); SET_DECLARE(linux_device_handler_set, struct linux_device_handler); static int linux_fixup(register_t **stack_base, struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack); static register_t *linux_copyout_strings(struct image_params *imgp); static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static int linux_szplatform; const char *linux_platform; static eventhandler_tag linux_exit_tag; static eventhandler_tag linux_exec_tag; +static eventhandler_tag linux_thread_dtor_tag; /* * Linux syscalls return negative errno's, we do positive and map them * Reference: * FreeBSD: src/sys/sys/errno.h * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h * linux-2.6.17.8/include/asm-generic/errno.h */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, -72, -67, -71 }; int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 0, LINUX_SIGUSR1, LINUX_SIGUSR2 }; int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGTRAP, SIGABRT, SIGBUS, SIGFPE, SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, SIGPIPE, SIGALRM, SIGTERM, SIGBUS, SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, SIGIO, SIGURG, SIGSYS }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)args->argc + 1); (*stack_base)--; suword(*stack_base, (intptr_t)(void *)envp); (*stack_base)--; suword(*stack_base, (intptr_t)(void *)argv); (*stack_base)--; 
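	/*
	 * Editorial note (assumption, added for clarity): together with the
	 * two suword() stores above, the final store below leaves argc at
	 * the top of the new stack, followed by the argv and envp pointers,
	 * which is the initial stack layout the Linux a.out startup code
	 * expects.
	 */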
suword(*stack_base, imgp->args->argc); return (0); } static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { struct proc *p; Elf32_Auxargs *args; Elf32_Addr *uplatform; struct ps_strings *arginfo; register_t *pos; KASSERT(curthread->td_proc == imgp->proc, ("unsafe elf_linux_fixup(), should be curproc")); p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, 0); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(uplatform)); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; suword(*stack_base, (register_t)imgp->args->argc); return (0); } /* * Copied from kern/kern_exec.c */ static register_t * linux_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install LINUX_PLATFORM */ copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform), linux_szplatform); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (LINUX_AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. 
*/ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } extern unsigned long linux_sznonrtsigcode; static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig, code; int oonstack, i; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = ksi->ksi_signo; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. 
*/ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = p->p_sysent->sv_sigcode_base; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; sigset_t bmask; int eflags, i; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return(EINVAL); } lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context. */ /* %gs was restored by the trampoline. 
*/ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* * call sigaltstack & ignore results.. 
*/ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; p = td->td_proc; frame = td->td_frame; sa->code = frame->tf_eax; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. 
*/ static void exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } static void linux_get_machine(const char **dst) { switch (cpu_class) { case CPUCLASS_686: *dst = "i686"; break; case CPUCLASS_586: *dst = "i586"; break; case CPUCLASS_486: *dst = "i486"; break; default: *dst = "i386"; } } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_sigsize = LINUX_SIGTBLSZ, .sv_sigtbl = bsd_to_linux_signal, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = linux_sigcode, .sv_szsigcode = &linux_szsigcode, .sv_prepsyscall = NULL, .sv_name = "Linux a.out", .sv_coredump = NULL, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, + .sv_thread_detach = linux_thread_detach, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_mask = 0, .sv_sigsize = LINUX_SIGTBLSZ, .sv_sigtbl = bsd_to_linux_signal, .sv_errsize = ELAST + 1, .sv_errtbl = bsd_to_linux_errno, .sv_transtrap = translate_traps, .sv_fixup = elf_linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = linux_sigcode, .sv_szsigcode = &linux_szsigcode, .sv_prepsyscall = NULL, .sv_name = "Linux ELF", .sv_coredump = elf32_coredump, .sv_imgact_try = exec_linux_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = exec_linux_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, + .sv_thread_detach = linux_thread_detach, }; INIT_SYSENTVEC(elf_sysvec, &elf_linux_sysvec); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (FALSE); /* * For linux we encode osrel as follows (see linux_mib.c): * VVVMMMIII (version, major, minor), see linux_mib.c. 
*/ *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; return (TRUE); } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = "/compat/linux", .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; struct linux_device_handler **ldhp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); - mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF); - sx_init(&emul_shared_lock, "emuldata->shared lock"); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit, NULL, 1000); linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec, NULL, 1000); + linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, + linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); linux_get_machine(&linux_platform); linux_szplatform = roundup(strlen(linux_platform) + 1, sizeof(char *)); linux_osd_jail_register(); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_unregister_handler(*ldhp); - mtx_destroy(&emul_lock); - sx_destroy(&emul_shared_lock); mtx_destroy(&futex_mtx); EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag); EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag); + EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return EOPNOTSUPP; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
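For reference, the VVVMMMIII osrel encoding computed in linux_trans_osrel() above can be exercised in isolation. The abi_note_to_osrel() helper below is a hypothetical stand-in for illustration only; it mirrors the desc[1..3] arithmetic, assuming desc[0] carries the GNU/Linux ABI tag (0, i.e. GNULINUX_ABI_DESC):

#include <stdint.h>
#include <stdio.h>

static int
abi_note_to_osrel(const uint32_t *desc, int32_t *osrel)
{
	if (desc[0] != 0)		/* GNULINUX_ABI_DESC */
		return (0);
	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
	return (1);
}

int
main(void)
{
	/* .note.ABI-tag descriptor words for "Linux 2.6.32". */
	uint32_t desc[4] = { 0, 2, 6, 32 };
	int32_t osrel;

	if (abi_note_to_osrel(desc, &osrel))
		printf("osrel = %d\n", osrel);	/* prints 2006032 */
	return (0);
}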