diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index 05afcdfcd045..943df00328fc 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -1,875 +1,871 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * Copyright (c) 2013, 2021 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux64, 1); #define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX_VDSOPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - \ LINUX_VDSOPAGE_SIZE) #define LINUX_SHAREDPAGE_LA48 (LINUX_VDSOPAGE_LA48 - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX_USRSTACK_LA48 LINUX_SHAREDPAGE_LA48 #define LINUX_PS_STRINGS_LA48 (LINUX_USRSTACK_LA48 - \ sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux_vdso_so_o_start; extern char _binary_linux_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; extern const char *linux_syscallnames[]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static void linux_set_fork_retval(struct thread *td); static int linux_vsyscall(struct thread *td); LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode); LINUX_VDSO_SYM_CHAR(linux_platform); LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); LINUX_VDSO_SYM_INTPTR(kern_cpu_selector); /* * According to the Intel x86 ISA 64-bit syscall * saves %rip to %rcx and rflags to %r11. Registers on syscall entry: * %rax system call number * %rcx return address * %r11 saved rflags * %rdi arg1 * %rsi arg2 * %rdx arg3 * %r10 arg4 * %r8 arg5 * %r9 arg6 * * Then FreeBSD fast_syscall() move registers: * %rcx -> trapframe.tf_rip * %r10 -> trapframe.tf_rcx */ static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; sa->code = frame->tf_rax; sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; /* Restore r10 earlier to avoid doing this multiply times. */ frame->tf_r10 = frame->tf_rcx; /* Restore %rcx for machine context. */ frame->tf_rcx = frame->tf_rip; td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_rax = td->td_retval[0]; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes, * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. * We saved this in tf_err. 
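 *
 * Worked example (illustrative values, not part of this change):
 * if a 2-byte "syscall" instruction at 0x400100 has to be
 * restarted, tf_err holds 2, so the rewind below moves tf_rip
 * from the saved return address 0x400102 back to 0x400100 and
 * the instruction is re-issued on return to user mode.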
* */ frame->tf_rip -= frame->tf_err; break; case EJUSTRETURN: break; default: frame->tf_rax = bsd_to_linux_errno(error); break; } /* * Differently from FreeBSD native ABI, on Linux only %rcx * and %r11 values are not preserved across the syscall. * Require full context restore to get all registers except * those two restored at return to usermode. */ set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } static void linux_set_fork_retval(struct thread *td) { struct trapframe *frame = td->td_frame; frame->tf_rax = 0; } void linux64_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos) { AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP2, 0); AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform)); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; set_pcb_flags(pcb, PCB_FULL_IRET); saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } /* * Copied from amd64/amd64/machdep.c */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_rt_sigframe sf; struct l_sigcontext *context; struct trapframe *regs; mcontext_t mc; unsigned long rflags; sigset_t bmask; int error, i; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &sf, sizeof(sf)); if (error != 0) return (error); p = td->td_proc; context = &sf.sf_uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { uprintf("pid %d comm %s linux mangled rflags %#lx\n", p->p_pid, p->p_comm, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ if (!CS_SECURE(context->sc_cs)) { uprintf("pid %d comm %s linux mangled cs %#x\n", p->p_pid, p->p_comm, context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&sf.sf_uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; if (sf.sf_uc.uc_mcontext.sc_fpstate != NULL) { struct savefpu *svfp = (struct savefpu *)mc.mc_fpstate; bzero(&mc, sizeof(mc)); mc.mc_ownedfp = _MC_FPOWNED_FPU; mc.mc_fpformat = _MC_FPFMT_XMM; svfp->sv_env.en_cw = sf.sf_fs.cwd; svfp->sv_env.en_sw = sf.sf_fs.swd; svfp->sv_env.en_tw = sf.sf_fs.twd; svfp->sv_env.en_opcode = sf.sf_fs.fop; svfp->sv_env.en_rip = sf.sf_fs.rip; svfp->sv_env.en_rdp = sf.sf_fs.rdp; svfp->sv_env.en_mxcsr = sf.sf_fs.mxcsr; svfp->sv_env.en_mxcsr_mask = sf.sf_fs.mxcsr_mask; /* FPU registers */ for (i = 0; i < nitems(svfp->sv_fp); ++i) bcopy(&sf.sf_fs.st[i], svfp->sv_fp[i].fp_acc.fp_bytes, sizeof(svfp->sv_fp[i].fp_acc.fp_bytes)); /* SSE registers */ for (i = 0; i < nitems(svfp->sv_xmm); ++i) bcopy(&sf.sf_fs.xmm[i], svfp->sv_xmm[i].xmm_bytes, sizeof(svfp->sv_xmm[i].xmm_bytes)); error = set_fpcontext(td, &mc, NULL, 0); if (error != 0) { uprintf("pid %d comm %s linux can't restore fpu state %d\n", p->p_pid, p->p_comm, error); return (error); } } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; struct savefpu *svfp; mcontext_t mc; int sig, code; int oonstack, issiginfo, i; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno); psp = p->p_sigacts; issiginfo = SIGISMEMBER(psp->ps_siginfo, sig); code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); bzero(&sf, sizeof(sf)); sf.sf_uc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; } else sp = (caddr_t)regs->tf_rsp - 128; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Make room, keeping the stack aligned. */ sp -= sizeof(struct l_rt_sigframe); sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); /* Save user context. 
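 * (The 128 bytes subtracted above keep the frame clear of the
 * SysV AMD64 red zone that leaf code may use below %rsp, and the
 * ~0xF mask rounds the frame start down to a 16-byte boundary.)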
*/ bsd_to_linux_sigset(mask, &sf.sf_uc.uc_sigmask); sf.sf_uc.uc_mcontext.sc_mask = sf.sf_uc.uc_sigmask; sf.sf_uc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_uc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_uc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_uc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_uc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_uc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_uc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_uc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_uc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_uc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_uc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_uc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_uc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_uc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_uc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_uc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_uc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_uc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.sc_err = regs->tf_err; sf.sf_uc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_uc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; get_fpcontext(td, &mc, NULL, NULL); KASSERT(mc.mc_fpformat != _MC_FPFMT_NODEV, ("fpu not present")); svfp = (struct savefpu *)mc.mc_fpstate; sf.sf_fs.cwd = svfp->sv_env.en_cw; sf.sf_fs.swd = svfp->sv_env.en_sw; sf.sf_fs.twd = svfp->sv_env.en_tw; sf.sf_fs.fop = svfp->sv_env.en_opcode; sf.sf_fs.rip = svfp->sv_env.en_rip; sf.sf_fs.rdp = svfp->sv_env.en_rdp; sf.sf_fs.mxcsr = svfp->sv_env.en_mxcsr; sf.sf_fs.mxcsr_mask = svfp->sv_env.en_mxcsr_mask; /* FPU registers */ for (i = 0; i < nitems(svfp->sv_fp); ++i) bcopy(svfp->sv_fp[i].fp_acc.fp_bytes, &sf.sf_fs.st[i], sizeof(svfp->sv_fp[i].fp_acc.fp_bytes)); /* SSE registers */ for (i = 0; i < nitems(svfp->sv_xmm); ++i) bcopy(svfp->sv_xmm[i].xmm_bytes, &sf.sf_fs.xmm[i], sizeof(svfp->sv_xmm[i].xmm_bytes)); sf.sf_uc.uc_mcontext.sc_fpstate = (struct l_fpstate *)((caddr_t)sfp + offsetof(struct l_rt_sigframe, sf_fs)); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig); /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { uprintf("pid %d comm %s has trashed its stack, killing\n", p->p_pid, p->p_comm); PROC_LOCK(p); sigexit(td, SIGILL); } fpstate_drop(td); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; if (issiginfo) { regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ } else { regs->tf_rsi = 0; regs->tf_rdx = 0; } regs->tf_rcx = (register_t)catcher; regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #define LINUX_VSYSCALL_START (-10UL << 20) #define LINUX_VSYSCALL_SZ 1024 const unsigned long linux_vsyscall_vector[] = { LINUX_SYS_gettimeofday, LINUX_SYS_linux_time, LINUX_SYS_linux_getcpu, }; static int linux_vsyscall(struct thread *td) { struct trapframe *frame; uint64_t retqaddr; int code, traced; int error; frame = td->td_frame; /* Check %rip for vsyscall area. 
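 * Worked example (illustrative, not part of this change):
 * LINUX_VSYSCALL_START is -10UL << 20 == 0xffffffffff600000, the
 * legacy Linux vsyscall page.  With LINUX_VSYSCALL_SZ == 1024, a
 * call that traps at 0xffffffffff600400 passes the alignment check
 * below and selects code == 1, i.e. linux_vsyscall_vector[1] ==
 * LINUX_SYS_linux_time.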
*/ if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) return (EINVAL); if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) return (EINVAL); code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; if (code >= nitems(linux_vsyscall_vector)) return (EINVAL); /* * vsyscall called as callq *(%rax), so we must * use return address from %rsp and also fixup %rsp. */ error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); if (error) return (error); frame->tf_rip = retqaddr; frame->tf_rax = linux_vsyscall_vector[code]; frame->tf_rsp += 8; traced = (frame->tf_flags & PSL_T); amd64_syscall(td, traced); return (0); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_fixup = __elfN(freebsd_fixup), .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, - .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, .sv_usrstack = LINUX_USRSTACK_LA48, .sv_psstrings = LINUX_PS_STRINGS_LA48, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __linuxN(copyout_auxargs), .sv_copyout_strings = __linuxN(copyout_strings), .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = linux_syscallnames, .sv_shared_page_base = LINUX_SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, .sv_hwcap = NULL, .sv_hwcap2 = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, .sv_set_fork_retval = linux_set_fork_retval, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error; error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); if (error == 0) - linux_on_exec(p, imgp); + error = linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). 
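 * Hence both are registered at SI_SUB_EXEC + 1 below:
 * linux_vdso_install() at SI_ORDER_FIRST and
 * linux_exec_sysvec_init() at SI_ORDER_ANY.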
*/ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; amd64_lower_shared_page(sv); /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector); tkoff = kern_cpu_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_cpu_selector_idx(); if (bootverbose) printf("Linux x86-64 vDSO cpu_selector: %lu\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux_vdso_so_o_start; char *vdso_end = &_binary_linux_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); linux_vdso_base = LINUX_VDSOPAGE_LA48; if (hw_lower_amd64_sharedpage != 0) linux_vdso_base -= PAGE_SIZE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Ehdr *ehdr; const Elf_Shdr *shdr; Elf64_Addr *where, val; Elf_Size rtype, symidx; const Elf_Rela *rela; Elf_Addr addr, addend; int relacnt; int i, j; MPASS(offset != 0); relacnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: printf("Linux x86_64 vDSO: unexpected Rel section\n"); break; case SHT_RELA: rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); relacnt = shdr[i].sh_size / sizeof(*rela); } } for (j = 0; j < relacnt; j++, rela++) { where = (Elf_Addr *)(mapping + rela->r_offset); addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); switch (rtype) { case R_X86_64_NONE: /* none */ break; case R_X86_64_RELATIVE: /* B + A */ addr = (Elf_Addr)(offset + addend); val = addr; if (*where != val) *where = val; break; case R_X86_64_IRELATIVE: printf("Linux x86_64 vDSO: unexpected ifunc relocation, " "symbol index %ld\n", symidx); break; default: printf("Linux x86_64 vDSO: unexpected relocation type %ld, " "symbol index %ld\n", rtype, symidx); } } } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, 
.interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-x86_64.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE | LINUX_BI_FUTEX_REQUEUE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux x86-64 ELF exec handler installed\n"); } else printf("cannot insert Linux x86-64 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux x86_64 ELF exec handler removed\n"); } else printf("Could not deinstall Linux x86_64 ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "Linux 64bit support"); diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c index 23d8f8767282..4e5d6eb55fc6 100644 --- a/sys/amd64/linux32/linux32_machdep.c +++ b/sys/amd64/linux32/linux32_machdep.c @@ -1,762 +1,754 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru); struct l_old_select_argv { l_int nfds; l_uintptr_t readfds; l_uintptr_t writefds; l_uintptr_t exceptfds; l_uintptr_t timeout; } __packed; static void bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru) { lru->ru_utime.tv_sec = ru->ru_utime.tv_sec; lru->ru_utime.tv_usec = ru->ru_utime.tv_usec; lru->ru_stime.tv_sec = ru->ru_stime.tv_sec; lru->ru_stime.tv_usec = ru->ru_stime.tv_usec; lru->ru_maxrss = ru->ru_maxrss; lru->ru_ixrss = ru->ru_ixrss; lru->ru_idrss = ru->ru_idrss; lru->ru_isrss = ru->ru_isrss; lru->ru_minflt = ru->ru_minflt; lru->ru_majflt = ru->ru_majflt; lru->ru_nswap = ru->ru_nswap; lru->ru_inblock = ru->ru_inblock; lru->ru_oublock = ru->ru_oublock; lru->ru_msgsnd = ru->ru_msgsnd; lru->ru_msgrcv = ru->ru_msgrcv; lru->ru_nsignals = ru->ru_nsignals; lru->ru_nvcsw = ru->ru_nvcsw; lru->ru_nivcsw = ru->ru_nivcsw; } int linux_copyout_rusage(struct rusage *ru, void *uaddr) { struct l_rusage lru; bsd_to_linux_rusage(ru, &lru); return (copyout(&lru, uaddr, sizeof(struct l_rusage))); } int linux_execve(struct thread *td, struct linux_execve_args *args) { struct image_args eargs; - char *path; int error; - if (!LUSECONVPATH(td)) { - error = freebsd32_exec_copyin_args(&eargs, args->path, UIO_USERSPACE, - args->argp, args->envp); - } else { - LCONVPATHEXIST(args->path, &path); - error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE, - args->argp, args->envp); - LFREEPATH(path); - } + error = freebsd32_exec_copyin_args(&eargs, args->path, UIO_USERSPACE, + args->argp, args->envp); if (error == 0) error = linux_common_execve(td, &eargs); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 
0 : error, td); return (error); } CTASSERT(sizeof(struct l_iovec32) == 8); int linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop) { struct l_iovec32 iov32; struct iovec *iov; struct uio *uio; uint32_t iovlen; int error, i; *uiop = NULL; if (iovcnt > UIO_MAXIOV) return (EINVAL); iovlen = iovcnt * sizeof(struct iovec); uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK); iov = (struct iovec *)(uio + 1); for (i = 0; i < iovcnt; i++) { error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32)); if (error) { free(uio, M_IOV); return (error); } iov[i].iov_base = PTRIN(iov32.iov_base); iov[i].iov_len = iov32.iov_len; } uio->uio_iov = iov; uio->uio_iovcnt = iovcnt; uio->uio_segflg = UIO_USERSPACE; uio->uio_offset = -1; uio->uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - uio->uio_resid) { free(uio, M_IOV); return (EINVAL); } uio->uio_resid += iov->iov_len; iov++; } *uiop = uio; return (0); } int linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, int error) { struct l_iovec32 iov32; struct iovec *iov; uint32_t iovlen; int i; *iovp = NULL; if (iovcnt > UIO_MAXIOV) return (error); iovlen = iovcnt * sizeof(struct iovec); iov = malloc(iovlen, M_IOV, M_WAITOK); for (i = 0; i < iovcnt; i++) { error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32)); if (error) { free(iov, M_IOV); return (error); } iov[i].iov_base = PTRIN(iov32.iov_base); iov[i].iov_len = iov32.iov_len; } *iovp = iov; return(0); } int linux_readv(struct thread *td, struct linux_readv_args *uap) { struct uio *auio; int error; error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int linux_writev(struct thread *td, struct linux_writev_args *uap) { struct uio *auio; int error; error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } struct l_ipc_kludge { l_uintptr_t msgp; l_long msgtyp; } __packed; int linux_ipc(struct thread *td, struct linux_ipc_args *args) { switch (args->what & 0xFFFF) { case LINUX_SEMOP: { return (kern_semop(td, args->arg1, PTRIN(args->ptr), args->arg2, NULL)); } case LINUX_SEMGET: { struct linux_semget_args a; a.key = args->arg1; a.nsems = args->arg2; a.semflg = args->arg3; return (linux_semget(td, &a)); } case LINUX_SEMCTL: { struct linux_semctl_args a; int error; a.semid = args->arg1; a.semnum = args->arg2; a.cmd = args->arg3; error = copyin(PTRIN(args->ptr), &a.arg, sizeof(a.arg)); if (error) return (error); return (linux_semctl(td, &a)); } case LINUX_SEMTIMEDOP: { struct linux_semtimedop_args a; a.semid = args->arg1; a.tsops = PTRIN(args->ptr); a.nsops = args->arg2; a.timeout = PTRIN(args->arg5); return (linux_semtimedop(td, &a)); } case LINUX_MSGSND: { struct linux_msgsnd_args a; a.msqid = args->arg1; a.msgp = PTRIN(args->ptr); a.msgsz = args->arg2; a.msgflg = args->arg3; return (linux_msgsnd(td, &a)); } case LINUX_MSGRCV: { struct linux_msgrcv_args a; a.msqid = args->arg1; a.msgsz = args->arg2; a.msgflg = args->arg3; if ((args->what >> 16) == 0) { struct l_ipc_kludge tmp; int error; if (args->ptr == 0) return (EINVAL); error = copyin(PTRIN(args->ptr), &tmp, sizeof(tmp)); if (error) return (error); a.msgp = PTRIN(tmp.msgp); a.msgtyp = tmp.msgtyp; } else { a.msgp = PTRIN(args->ptr); a.msgtyp = args->arg5; } return (linux_msgrcv(td, &a)); } case LINUX_MSGGET: { struct linux_msgget_args a; a.key = args->arg1; a.msgflg = 
args->arg2; return (linux_msgget(td, &a)); } case LINUX_MSGCTL: { struct linux_msgctl_args a; a.msqid = args->arg1; a.cmd = args->arg2; a.buf = PTRIN(args->ptr); return (linux_msgctl(td, &a)); } case LINUX_SHMAT: { struct linux_shmat_args a; l_uintptr_t addr; int error; a.shmid = args->arg1; a.shmaddr = PTRIN(args->ptr); a.shmflg = args->arg2; error = linux_shmat(td, &a); if (error != 0) return (error); addr = td->td_retval[0]; error = copyout(&addr, PTRIN(args->arg3), sizeof(addr)); td->td_retval[0] = 0; return (error); } case LINUX_SHMDT: { struct linux_shmdt_args a; a.shmaddr = PTRIN(args->ptr); return (linux_shmdt(td, &a)); } case LINUX_SHMGET: { struct linux_shmget_args a; a.key = args->arg1; a.size = args->arg2; a.shmflg = args->arg3; return (linux_shmget(td, &a)); } case LINUX_SHMCTL: { struct linux_shmctl_args a; a.shmid = args->arg1; a.cmd = args->arg2; a.buf = PTRIN(args->ptr); return (linux_shmctl(td, &a)); } default: break; } return (EINVAL); } int linux_old_select(struct thread *td, struct linux_old_select_args *args) { struct l_old_select_argv linux_args; struct linux_select_args newsel; int error; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); newsel.nfds = linux_args.nfds; newsel.readfds = PTRIN(linux_args.readfds); newsel.writefds = PTRIN(linux_args.writefds); newsel.exceptfds = PTRIN(linux_args.exceptfds); newsel.timeout = PTRIN(linux_args.timeout); return (linux_select(td, &newsel)); } int linux_set_cloned_tls(struct thread *td, void *desc) { struct l_user_desc info; struct pcb *pcb; int error; error = copyin(desc, &info, sizeof(struct l_user_desc)); if (error) { linux_msg(td, "set_cloned_tls copyin info failed!"); } else { /* We might copy out the entry_number as GUGS32_SEL. */ info.entry_number = GUGS32_SEL; error = copyout(&info, desc, sizeof(struct l_user_desc)); if (error) linux_msg(td, "set_cloned_tls copyout info failed!"); pcb = td->td_pcb; update_pcb_bases(pcb); pcb->pcb_gsbase = (register_t)info.base_addr; td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); } return (error); } int linux_set_upcall(struct thread *td, register_t stack) { if (stack) td->td_frame->tf_rsp = stack; /* * The newly created Linux thread returns * to the user space by the same path that a parent do. 
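 * In clone(2) terms (illustrative, not part of this change): the
 * new thread observes a return value of 0 (tf_rax is cleared
 * below), while the parent receives the child's TID through the
 * normal syscall return path.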
*/ td->td_frame->tf_rax = 0; return (0); } int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot, args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * PAGE_SIZE)); } int linux_mmap(struct thread *td, struct linux_mmap_args *args) { int error; struct l_mmap_argv linux_args; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); return (linux_mmap_common(td, linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, (uint32_t)linux_args.pgoff)); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len, uap->prot)); } int linux_madvise(struct thread *td, struct linux_madvise_args *uap) { return (linux_madvise_common(td, PTROUT(uap->addr), uap->len, uap->behav)); } int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; if (args->level < 0 || args->level > 3) return (EINVAL); if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | (args->level * (PSL_IOPL / 3)); return (0); } int linux_sigaction(struct thread *td, struct linux_sigaction_args *args) { l_osigaction_t osa; l_sigaction_t act, oact; int error; if (args->nsa != NULL) { error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); if (error) return (error); act.lsa_handler = osa.lsa_handler; act.lsa_flags = osa.lsa_flags; act.lsa_restorer = osa.lsa_restorer; LINUX_SIGEMPTYSET(act.lsa_mask); act.lsa_mask.__mask = osa.lsa_mask; } error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, args->osa ? &oact : NULL); if (args->osa != NULL && !error) { osa.lsa_handler = oact.lsa_handler; osa.lsa_flags = oact.lsa_flags; osa.lsa_restorer = oact.lsa_restorer; osa.lsa_mask = oact.lsa_mask.__mask; error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); } return (error); } /* * Linux has two extra args, restart and oldmask. We don't use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. 
*/ int linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) { sigset_t sigmask; l_sigset_t mask; LINUX_SIGEMPTYSET(mask); mask.__mask = args->mask; linux_to_bsd_sigset(&mask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_pause(struct thread *td, struct linux_pause_args *args) { struct proc *p = td->td_proc; sigset_t sigmask; PROC_LOCK(p); sigmask = td->td_sigmask; PROC_UNLOCK(p); return (kern_sigsuspend(td, sigmask)); } int linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap) { struct timeval atv; l_timeval atv32; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); atv32.tv_sec = atv.tv_sec; atv32.tv_usec = atv.tv_usec; error = copyout(&atv32, uap->tp, sizeof(atv32)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = 0; rtz.tz_dsttime = 0; error = copyout(&rtz, uap->tzp, sizeof(rtz)); } return (error); } int linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap) { l_timeval atv32; struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tp) { error = copyin(uap->tp, &atv32, sizeof(atv32)); if (error) return (error); atv.tv_sec = atv32.tv_sec; atv.tv_usec = atv32.tv_usec; tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int linux_getrusage(struct thread *td, struct linux_getrusage_args *uap) { struct rusage s; int error; error = kern_getrusage(td, uap->who, &s); if (error != 0) return (error); if (uap->rusage != NULL) error = linux_copyout_rusage(&s, uap->rusage); return (error); } int linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args) { struct l_user_desc info; struct pcb *pcb; int error; error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); /* * Semantics of Linux version: every thread in the system has array * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. * This syscall loads one of the selected TLS decriptors with a value * and also loads GDT descriptors 6, 7 and 8 with the content of * the per-thread descriptors. * * Semantics of FreeBSD version: I think we can ignore that Linux has * three per-thread descriptors and use just the first one. * The tls_array[] is used only in [gs]et_thread_area() syscalls and * for loading the GDT descriptors. We use just one GDT descriptor * for TLS, so we will load just one. * * XXX: This doesn't work when a user space process tries to use more * than one TLS segment. Comment in the Linux source says wine might * do this. */ /* * GLIBC reads current %gs and call set_thread_area() with it. * We should let GUDATA_SEL and GUGS32_SEL proceed as well because * we use these segments. */ switch (info.entry_number) { case GUGS32_SEL: case GUDATA_SEL: case 6: case -1: info.entry_number = GUGS32_SEL; break; default: return (EINVAL); } /* * We have to copy out the GDT entry we use. * * XXX: What if a user space program does not check the return value * and tries to use 6, 7 or 8? 
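 *
 * Illustrative flow (not part of this change): glibc typically
 * passes entry_number == -1 to ask the kernel to pick a slot; the
 * switch above forces the answer to GUGS32_SEL, and the copyout
 * below writes that choice back so later TLS loads keep using the
 * single GDT slot reserved for 32-bit %gs.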
*/ error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (error); pcb = td->td_pcb; update_pcb_bases(pcb); pcb->pcb_gsbase = (register_t)info.base_addr; update_gdt_gsbase(td, info.base_addr); return (0); } void bsd_to_linux_regset32(const struct reg32 *b_reg, struct linux_pt_regset32 *l_regset) { l_regset->ebx = b_reg->r_ebx; l_regset->ecx = b_reg->r_ecx; l_regset->edx = b_reg->r_edx; l_regset->esi = b_reg->r_esi; l_regset->edi = b_reg->r_edi; l_regset->ebp = b_reg->r_ebp; l_regset->eax = b_reg->r_eax; l_regset->ds = b_reg->r_ds; l_regset->es = b_reg->r_es; l_regset->fs = b_reg->r_fs; l_regset->gs = b_reg->r_gs; l_regset->orig_eax = b_reg->r_eax; l_regset->eip = b_reg->r_eip; l_regset->cs = b_reg->r_cs; l_regset->eflags = b_reg->r_eflags; l_regset->esp = b_reg->r_esp; l_regset->ss = b_reg->r_ss; } int futex_xchgl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_xchgl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_xchgl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_xchgl_smap : futex_xchgl_nosmap); } int futex_addl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_addl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_addl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_addl_smap : futex_addl_nosmap); } int futex_orl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_orl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_orl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_orl_smap : futex_orl_nosmap); } int futex_andl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_andl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_andl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_andl_smap : futex_andl_nosmap); } int futex_xorl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_xorl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_xorl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_xorl_smap : futex_xorl_nosmap); } int linux_ptrace_peekuser(struct thread *td, pid_t pid, void *addr, void *data) { LINUX_RATELIMIT_MSG_OPT1("PTRACE_PEEKUSER offset %ld not implemented; " "returning EINVAL", (uintptr_t)addr); return (EINVAL); } int linux_ptrace_pokeuser(struct thread *td, pid_t pid, void *addr, void *data) { LINUX_RATELIMIT_MSG_OPT1("PTRACE_POKEUSER offset %ld " "not implemented; returning EINVAL", (uintptr_t)addr); return (EINVAL); } diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 3cec5d2e9eeb..e51ae68a229d 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -1,1073 +1,1069 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE) #define LINUX32_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX32_VDSOPAGE (LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE) #define LINUX32_SHAREDPAGE (LINUX32_VDSOPAGE - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX32_USRSTACK LINUX32_SHAREDPAGE static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux32_vdso_so_o_start; extern char _binary_linux32_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; extern const char *linux32_syscallnames[]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static void linux32_fixlimit(struct rlimit *rl, int which); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux32_set_fork_retval(struct thread *td); static void linux32_set_syscall_retval(struct thread *td, int error); struct linux32_ps_strings { u_int32_t ps_argvstr; /* first of 0 or more argument strings */ u_int ps_nargvstr; /* the number of argument strings */ u_int32_t ps_envstr; /* first of 0 or more environment strings */ u_int ps_nenvstr; /* the number of environment strings */ }; #define LINUX32_PS_STRINGS (LINUX32_USRSTACK - \ sizeof(struct linux32_ps_strings)) LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall); LINUX_VDSO_SYM_INTPTR(linux32_vdso_sigcode); LINUX_VDSO_SYM_INTPTR(linux32_vdso_rt_sigcode); 
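/*
 * The LINUX_VDSO_SYM_* entries above and below bind kernel-side
 * pointers to symbols exported by the prelinked 32-bit vDSO image.
 * Illustrative only (not part of this change): a 32-bit guest finds
 * the __kernel_vsyscall trampoline advertised in
 * linux32_arch_copyout_auxargs() through the aux vector, e.g.
 *
 *	void (*vsyscall)(void) = (void *)getauxval(AT_SYSINFO);
 *
 * and glibc enters the kernel through it instead of "int $0x80".
 */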
LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); LINUX_VDSO_SYM_INTPTR(kern_cpu_selector); LINUX_VDSO_SYM_CHAR(linux_platform); void linux32_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos) { AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP2, 0); AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform)); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno); code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_uc); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn and libgcc unwind. */ frame.sf_uc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_uc.uc_sigmask); frame.sf_uc.uc_mcontext.sc_mask = frame.sf_uc.uc_sigmask.__mask; frame.sf_uc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_uc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_uc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_uc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_uc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_uc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_uc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_uc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_uc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_uc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_uc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_uc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_uc.uc_mcontext.sc_es = regs->tf_es; frame.sf_uc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_uc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_uc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_uc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_uc.uc_mcontext.sc_err = regs->tf_err; frame.sf_uc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_uc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. 
*/ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_vdso_rt_sigcode; regs->tf_rdi = PTROUT(catcher); regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno); code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_sig = sig; frame.sf_sigmask = *mask; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_vdso_sigcode; regs->tf_rdi = PTROUT(catcher); regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. 
Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ eflags = frame.sf_sc.sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } kern_sigprocmask(td, SIG_SETMASK, &frame.sf_sigmask, NULL, 0); /* Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ eflags = context->sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void linux32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_rax = bsd_to_linux_errno(error); } } static void linux32_set_fork_retval(struct thread *td) { struct trapframe *frame = td->td_frame; frame->tf_rax = 0; } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = (register_t)imgp->ps_strings; x86_clear_dbregs(pcb); fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); } /* * XXX copied from ia32_sysvec.c. 
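 *
 * The layout built below grows down from the 32-bit ps_strings
 * area: execpath, the SSP canary, the argument and environment
 * strings, space for LINUX_AT_COUNT Elf32_Auxinfo entries, and
 * finally the argv[]/envp[] pointer vectors, whose base becomes
 * the new stack base.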
*/ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; u_int32_t *vectp; char *stringp; uintptr_t destp, ustringp; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; arginfo = (struct linux32_ps_strings *)PROC_PS_STRINGS(imgp->proc); destp = (uintptr_t)arginfo; if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(uint32_t)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(uint32_t)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(uint32_t)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(uint32_t)); } vectp = (uint32_t *)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword32(vectp++, 0) != 0) return (EFAULT); if (suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. 
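linux_copyout_strings() above lays the new image's user stack out top-down: ps_strings, the execpath, the SSP canary, the argument/environment strings, optional room for the ELF auxv, and finally the argv/envp pointer vectors, with each start rounded down to a 4-byte boundary. A simplified host-side sketch of that arithmetic, with made-up sizes purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define ROUNDDOWN2(x, a) ((x) & ~((uintptr_t)(a) - 1))

int
main(void)
{
    uintptr_t top = 0x7fffe000;          /* pretend ps_strings address */
    uintptr_t destp = top;

    destp -= 64;                          /* execpath (assumed length) */
    destp = ROUNDDOWN2(destp, 4);
    uintptr_t execpath = destp;

    destp -= 16;                          /* SSP canary */
    uintptr_t canary = destp;

    destp -= 2048;                        /* argument + environment strings */
    destp = ROUNDDOWN2(destp, 4);
    uintptr_t strings = destp;

    destp -= 32 * 8;                      /* auxv slots (assumed count/size) */
    destp = ROUNDDOWN2(destp, 4);

    /* argv[3] + NULL, envp[2] + NULL, four bytes per 32-bit pointer. */
    uintptr_t vectp = destp - (3 + 1 + 2 + 1) * 4;

    printf("execpath %#lx canary %#lx strings %#lx vectors %#lx\n",
        (unsigned long)execpath, (unsigned long)canary,
        (unsigned long)strings, (unsigned long)vectp);
    return (0);
}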
*/ if (suword32(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); bool linux32_emulate_i386 = false; SYSCTL_BOOL(_compat_linux32, OID_AUTO, emulate_i386, CTLFLAG_RWTUN, &linux32_emulate_i386, 0, "Emulate the real i386"); static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_fixup = elf32_freebsd_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux32_prepare_notes, - .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_psstringssz = sizeof(struct linux32_ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __linuxN(copyout_auxargs), .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux32_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = linux32_syscallnames, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = NULL, .sv_hwcap2 = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, .sv_set_fork_retval = linux32_set_fork_retval, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error; error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX32_VDSOPAGE_SIZE, imgp); if (error == 0) - linux_on_exec(p, imgp); + error = linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). 
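The sv_onexec change above turns linux_on_exec() from a fire-and-forget hook into one whose failure aborts the exec: the vDSO is mapped first, and only if that succeeded is the second step run and its error propagated. Reduced to its control flow, the pattern is the usual two-step chain; the stand-in functions below are illustrative only.

#include <errno.h>
#include <stdio.h>

static int step_map_vdso(void)  { return (0); }      /* stand-in for linux_map_vdso() */
static int step_abi_setup(void) { return (ENOENT); } /* stand-in for linux_on_exec() */

static int
on_exec_sketch(void)
{
    int error;

    error = step_map_vdso();
    if (error == 0)
        error = step_abi_setup();   /* previously this result was discarded */
    return (error);
}

int
main(void)
{
    printf("on_exec_sketch() -> %d\n", on_exec_sketch());
    return (0);
}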
*/ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); tkoff = kern_cpu_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_cpu_selector_idx(); if (bootverbose) printf("Linux i386 vDSO cpu_selector: %u\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux32_vdso_so_o_start; char *vdso_end = &_binary_linux32_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE); linux_vdso_base = LINUX32_VDSOPAGE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Shdr *shdr; const Elf_Rel *rel; const Elf_Ehdr *ehdr; Elf32_Addr *where; Elf_Size rtype, symidx; Elf32_Addr addr, addend; int i, relcnt; MPASS(offset != 0); relcnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); relcnt = shdr[i].sh_size / sizeof(*rel); break; case SHT_RELA: printf("Linux i386 vDSO: unexpected Rela section\n"); break; } } for (i = 0; i < relcnt; i++, rel++) { where = (Elf32_Addr *)(mapping + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); switch (rtype) { case R_386_NONE: /* none */ break; case R_386_RELATIVE: /* B + A */ addr = (Elf32_Addr)PTROUT(offset + addend); if (*where != addr) *where = addr; break; case R_386_IRELATIVE: printf("Linux i386 vDSO: unexpected ifunc relocation, " "symbol index %ld\n", (intmax_t)symidx); break; default: printf("Linux i386 vDSO: unexpected relocation type %ld, " "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx); } } } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; 
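linux_vdso_reloc() above walks the i386 vDSO image's SHT_REL section and applies R_386_RELATIVE entries as "base + addend", where the addend is the word already stored at the relocation target. A self-contained sketch of that fixup over a fake little-endian image (struct and macro names are local to the sketch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal Elf32_Rel-style record: target offset plus packed type/symbol. */
struct rel_sketch {
    uint32_t r_offset;
    uint32_t r_info;
};
#define R_TYPE(info)            ((info) & 0xff)
#define R_386_RELATIVE_SKETCH   8

static void
apply_relative(uint8_t *image, const struct rel_sketch *rel, size_t nrel,
    uint32_t base)
{
    for (size_t i = 0; i < nrel; i++) {
        if (R_TYPE(rel[i].r_info) != R_386_RELATIVE_SKETCH)
            continue;
        uint32_t word;
        memcpy(&word, image + rel[i].r_offset, sizeof(word)); /* addend A */
        word += base;                                         /* B + A */
        memcpy(image + rel[i].r_offset, &word, sizeof(word));
    }
}

int
main(void)
{
    uint8_t image[16] = { 0 };
    image[4] = 0x10;    /* addend 0x10 stored in place, little endian */
    struct rel_sketch r = { .r_offset = 4, .r_info = R_386_RELATIVE_SKETCH };

    apply_relative(image, &r, 1, 0xffffc000);
    uint32_t fixed;
    memcpy(&fixed, image + 4, sizeof(fixed));
    printf("relocated word: %#x\n", (unsigned)fixed);  /* 0xffffc010 */
    return (0);
}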
static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE | LINUX_BI_FUTEX_REQUEUE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux i386 ELF exec handler installed\n"); } else printf("cannot insert Linux i386 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux i386 ELF exec handler removed\n"); } else printf("Could not deinstall Linux i386 ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index bb9ff25893eb..3e06634117fe 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -1,607 +1,605 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1994-1996 Søren Schmidt * Copyright (c) 2018 Turing Robotic Industries Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif MODULE_VERSION(linux64elf, 1); #define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - \ LINUX_VDSOPAGE_SIZE) #define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX_USRSTACK LINUX_SHAREDPAGE #define LINUX_PS_STRINGS (LINUX_USRSTACK - \ sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux_vdso_so_o_start; extern char _binary_linux_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; extern const char *linux_syscallnames[]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /* DTrace probes */ LIN_SDT_PROBE_DEFINE0(sysvec, linux_exec_setregs, todo); LINUX_VDSO_SYM_CHAR(linux_platform); LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); LINUX_VDSO_SYM_INTPTR(__user_rt_sigreturn); static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct syscall_args *sa; register_t *ap; p = td->td_proc; ap = td->td_frame->tf_x; sa = &td->td_sa; sa->code = td->td_frame->tf_x[8]; sa->original_code = sa->code; /* LINUXTODO: generic syscall? 
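linux_fetch_syscall_args() here reads the syscall number from x8 and the arguments from x0..x5 (copied out of tf_x just below), because that is the AArch64 Linux calling convention the emulated binary uses. A minimal caller-side illustration, compilable only with GCC/Clang targeting arm64 Linux; 64 is assumed to be the generic __NR_write number.

#include <stddef.h>

static long
raw_write(int fd, const void *buf, size_t len)
{
    register long x8 __asm__("x8") = 64;        /* syscall number */
    register long x0 __asm__("x0") = fd;        /* arg1, also the result */
    register long x1 __asm__("x1") = (long)buf; /* arg2 */
    register long x2 __asm__("x2") = (long)len; /* arg3 */

    __asm__ volatile("svc #0"
        : "+r"(x0)
        : "r"(x1), "r"(x2), "r"(x8)
        : "memory");
    return (x0);    /* negative values encode -errno */
}

int
main(void)
{
    raw_write(1, "hello from svc #0\n", 18);
    return (0);
}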
*/ if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; if (sa->callp->sy_narg > nitems(sa->args)) panic("ARM64TODO: Could we have more than %zu args?", nitems(sa->args)); memcpy(sa->args, ap, nitems(sa->args) * sizeof(register_t)); td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { td->td_retval[1] = td->td_frame->tf_x[1]; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) td->td_frame->tf_x[0] = bsd_to_linux_errno(error); } } void linux64_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos) { AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, *imgp->sysent->sv_hwcap); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP2, *imgp->sysent->sv_hwcap2); AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform)); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* LINUXTODO: validate */ LIN_SDT_PROBE0(sysvec, linux_exec_setregs, todo); memset(regs, 0, sizeof(*regs)); /* glibc start.S registers function pointer in x0 with atexit. */ regs->tf_sp = stack; #if 0 /* LINUXTODO: See if this is used. */ regs->tf_lr = imgp->entry_addr; #else regs->tf_lr = 0xffffffffffffffff; #endif regs->tf_elr = imgp->entry_addr; pcb->pcb_tpidr_el0 = 0; pcb->pcb_tpidrro_el0 = 0; WRITE_SPECIALREG(tpidrro_el0, 0); WRITE_SPECIALREG(tpidr_el0, 0); #ifdef VFP vfp_reset_state(td, pcb); #endif /* * Clear debug register state. It is not applicable to the new process. */ bzero(&pcb->pcb_dbg_regs, sizeof(pcb->pcb_dbg_regs)); } int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_sigframe *frame; ucontext_t uc; struct trapframe *tf; int error; tf = td->td_frame; frame = (struct l_sigframe *)tf->tf_sp; if (copyin((void *)&frame->uc, &uc, sizeof(uc))) return (EFAULT); error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); /* Restore signal mask. */ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); return (EJUSTRETURN); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td; struct proc *p; struct trapframe *tf; struct l_sigframe *fp, *frame; struct l_fpsimd_context *fpsimd; struct l_esr_context *esr; l_stack_t uc_stack; ucontext_t uc; uint8_t *scr; struct sigacts *psp; int onstack, sig, issiginfo; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); issiginfo = SIGISMEMBER(psp->ps_siginfo, sig); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. 
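The comment above introduces the alternate-stack decision made in the code that follows: the signal frame goes on the sigaltstack only if one is armed and the thread is not already on it, and the chosen pointer is then aligned for AArch64. A small sketch of that placement logic, with the 16-byte alignment and parameter names assumed for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define STACKALIGN_SKETCH(sp) ((uintptr_t)(sp) & ~(uintptr_t)15)

/* Pick where the signal frame goes: alternate stack if armed, else current sp. */
static uintptr_t
pick_sigframe_top(uintptr_t cur_sp, uintptr_t alt_base, size_t alt_size,
    bool alt_enabled, bool on_altstack, size_t frame_size)
{
    uintptr_t top;

    if (alt_enabled && !on_altstack)
        top = alt_base + alt_size;  /* stacks grow down: start at the top */
    else
        top = cur_sp;
    top -= frame_size;
    return (STACKALIGN_SKETCH(top)); /* keep 16-byte alignment for AArch64 */
}

int
main(void)
{
    printf("%#lx\n", (unsigned long)pick_sigframe_top(0x7fff0000,
        0x40000000, 0x8000, true, false, 0x230));
    return (0);
}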
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else { fp = (struct l_sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct l_sigframe *)STACKALIGN(fp); get_mcontext(td, &uc.uc_mcontext, 0); uc.uc_sigmask = *mask; uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); uc_stack.ss_size = td->td_sigstk.ss_size; uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) != 0 ? (onstack ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Fill in the frame to copy out */ frame = malloc(sizeof(*frame), M_LINUX, M_WAITOK | M_ZERO); memcpy(&frame->sf.sf_uc.uc_sc.regs, tf->tf_x, sizeof(tf->tf_x)); frame->sf.sf_uc.uc_sc.regs[30] = tf->tf_lr; frame->sf.sf_uc.uc_sc.sp = tf->tf_sp; frame->sf.sf_uc.uc_sc.pc = tf->tf_elr; frame->sf.sf_uc.uc_sc.pstate = tf->tf_spsr; frame->sf.sf_uc.uc_sc.fault_address = (register_t)ksi->ksi_addr; /* Stack frame for unwinding */ frame->fp = tf->tf_x[29]; frame->lr = tf->tf_elr; /* Translate the signal. */ sig = bsd_to_linux_signal(sig); siginfo_to_lsiginfo(&ksi->ksi_info, &frame->sf.sf_si, sig); bsd_to_linux_sigset(mask, &frame->sf.sf_uc.uc_sigmask); /* * Prepare fpsimd & esr. Does not check sizes, as * __reserved is big enougth. */ scr = (uint8_t *)&frame->sf.sf_uc.uc_sc.__reserved; #ifdef VFP fpsimd = (struct l_fpsimd_context *) scr; fpsimd->head.magic = L_FPSIMD_MAGIC; fpsimd->head.size = sizeof(struct l_fpsimd_context); fpsimd->fpsr = uc.uc_mcontext.mc_fpregs.fp_sr; fpsimd->fpcr = uc.uc_mcontext.mc_fpregs.fp_cr; memcpy(fpsimd->vregs, &uc.uc_mcontext.mc_fpregs.fp_q, sizeof(uc.uc_mcontext.mc_fpregs.fp_q)); scr += roundup(sizeof(struct l_fpsimd_context), 16); #endif if (ksi->ksi_addr != 0) { esr = (struct l_esr_context *) scr; esr->head.magic = L_ESR_MAGIC; esr->head.size = sizeof(struct l_esr_context); esr->esr = tf->tf_esr; } memcpy(&frame->sf.sf_uc.uc_stack, &uc_stack, sizeof(uc_stack)); memcpy(&frame->uc, &uc, sizeof(uc)); /* Copy the sigframe out to the user's stack. */ if (copyout(frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. 
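The fpsimd and esr records written into uc_sc.__reserved above are self-describing: each starts with a {magic, size} header so a consumer can walk the area and pick out the records it understands. A sketch of that tag-length layout; the magic values are illustrative placeholders.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Each record in the __reserved area begins with a magic/size header. */
struct ctx_header { uint32_t magic; uint32_t size; };

#define FPSIMD_MAGIC_SKETCH 0x46508001u  /* illustrative value */
#define ESR_MAGIC_SKETCH    0x45535201u  /* illustrative value */

static const struct ctx_header *
find_record(const uint8_t *area, size_t len, uint32_t magic)
{
    size_t off = 0;

    while (off + sizeof(struct ctx_header) <= len) {
        struct ctx_header h;
        memcpy(&h, area + off, sizeof(h));
        if (h.magic == 0 || h.size < sizeof(h))
            break;              /* terminator or malformed record */
        if (h.magic == magic)
            return ((const struct ctx_header *)(area + off));
        off += h.size;
    }
    return (NULL);
}

int
main(void)
{
    uint8_t area[64] = { 0 };
    struct ctx_header fp = { FPSIMD_MAGIC_SKETCH, 32 };
    struct ctx_header esr = { ESR_MAGIC_SKETCH, 16 };

    memcpy(area, &fp, sizeof(fp));
    memcpy(area + 32, &esr, sizeof(esr));

    /* The demo assumes the record exists, so the lookup cannot return NULL. */
    printf("esr record at offset %td\n",
        (const uint8_t *)find_record(area, sizeof(area), ESR_MAGIC_SKETCH) -
        area);
    return (0);
}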
*/ free(frame, M_LINUX); CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } free(frame, M_LINUX); tf->tf_x[0]= sig; if (issiginfo) { tf->tf_x[1] = (register_t)&fp->sf.sf_si; tf->tf_x[2] = (register_t)&fp->sf.sf_uc; } else { tf->tf_x[1] = 0; tf->tf_x[2] = 0; } tf->tf_x[29] = (register_t)&fp->fp; tf->tf_elr = (register_t)catcher; tf->tf_sp = (register_t)fp; tf->tf_lr = (register_t)__user_rt_sigreturn; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_fixup = __elfN(freebsd_fixup), .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, - .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_auxargs = __linuxN(copyout_auxargs), .sv_copyout_strings = __linuxN(copyout_strings), .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = linux_syscallnames, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = &elf_hwcap, .sv_hwcap2 = &elf_hwcap2, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error; error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); if (error == 0) - linux_on_exec(p, imgp); + error = linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). 
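The register setup above (sig in x0, &sf_si in x1, &sf_uc in x2, lr pointed at __user_rt_sigreturn) is exactly what a three-argument SA_SIGINFO handler expects to receive. A small userspace program showing that handler-side view of the same ABI; printf inside a handler is used only because the signal is raised synchronously here.

#include <signal.h>
#include <stdio.h>
#include <string.h>

/* The kernel enters the handler with (signo, siginfo, ucontext), the same
 * triple linux_rt_sendsig() places in x0/x1/x2. */
static void
handler(int sig, siginfo_t *si, void *ucp)
{
    printf("sig=%d si_signo=%d uc=%p\n", sig, si->si_signo, ucp);
}

int
main(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = handler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGUSR1, &sa, NULL);
    raise(SIGUSR1);
    return (0);
}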
*/ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset; } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux_vdso_so_o_start; char *vdso_end = &_binary_linux_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); linux_vdso_base = LINUX_VDSOPAGE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { Elf_Size rtype, symidx; const Elf_Rela *rela; const Elf_Shdr *shdr; const Elf_Ehdr *ehdr; Elf_Addr *where; Elf_Addr addr, addend; int i, relacnt; MPASS(offset != 0); relacnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: printf("Linux Aarch64 vDSO: unexpected Rel section\n"); break; case SHT_RELA: rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); relacnt = shdr[i].sh_size / sizeof(*rela); } } for (i = 0; i < relacnt; i++, rela++) { where = (Elf_Addr *)(mapping + rela->r_offset); addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); switch (rtype) { case R_AARCH64_NONE: /* none */ break; case R_AARCH64_RELATIVE: /* B + A */ addr = (Elf_Addr)(mapping + addend); if (*where != addr) *where = addr; break; default: printf("Linux Aarch64 vDSO: unexpected relocation type %ld, " "symbol index %ld\n", rtype, symidx); } } } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; struct linux_ioctl_handler**lihp; int error; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? 
stathz : hz); if (bootverbose) printf("Linux arm64 ELF exec handler installed\n"); } break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux arm64 ELF exec handler removed\n"); } else printf("Could not deinstall Linux arm64 ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "AArch64 Linux 64bit support"); diff --git a/sys/compat/linux/linux_emul.c b/sys/compat/linux/linux_emul.c index ab877116b9fc..6a2ea1f26fe5 100644 --- a/sys/compat/linux/linux_emul.c +++ b/sys/compat/linux/linux_emul.c @@ -1,381 +1,356 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1994-1996 Søren Schmidt * Copyright (c) 2006 Roman Divacky * All rights reserved. * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * This returns reference to the thread emuldata entry (if found) * * Hold PROC_LOCK when referencing emuldata from other threads. */ struct linux_emuldata * em_find(struct thread *td) { struct linux_emuldata *em; em = td->td_emuldata; return (em); } /* * This returns reference to the proc pemuldata entry (if found) * * Hold PROC_LOCK when referencing proc pemuldata from other threads. * Hold LINUX_PEM_LOCK wher referencing pemuldata members. 
*/ struct linux_pemuldata * pem_find(struct proc *p) { struct linux_pemuldata *pem; pem = p->p_emuldata; return (pem); } /* * Linux apps generally expect the soft open file limit to be set * to 1024, often iterating over all the file descriptors up to that * limit instead of using closefrom(2). Give them what they want, * unless there already is a resource limit in place. */ static void linux_set_default_openfiles(struct thread *td, struct proc *p) { struct rlimit rlim; int error __diagused; if (linux_default_openfiles < 0) return; PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_NOFILE, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur != rlim.rlim_max || rlim.rlim_cur <= linux_default_openfiles) return; rlim.rlim_cur = linux_default_openfiles; error = kern_proc_setrlimit(td, p, RLIMIT_NOFILE, &rlim); KASSERT(error == 0, ("kern_proc_setrlimit failed")); } /* * The default stack size limit in Linux is 8MB. */ static void linux_set_default_stacksize(struct thread *td, struct proc *p) { struct rlimit rlim; int error __diagused; if (linux_default_stacksize < 0) return; PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur != rlim.rlim_max || rlim.rlim_cur <= linux_default_stacksize) return; rlim.rlim_cur = linux_default_stacksize; error = kern_proc_setrlimit(td, p, RLIMIT_STACK, &rlim); KASSERT(error == 0, ("kern_proc_setrlimit failed")); } void linux_proc_init(struct thread *td, struct thread *newtd, bool init_thread) { struct linux_emuldata *em; struct linux_pemuldata *pem; struct proc *p; if (newtd != NULL) { p = newtd->td_proc; /* non-exec call */ em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO); if (init_thread) { LINUX_CTR1(proc_init, "thread newtd(%d)", newtd->td_tid); em->em_tid = newtd->td_tid; } else { LINUX_CTR1(proc_init, "fork newtd(%d)", p->p_pid); em->em_tid = p->p_pid; pem = malloc(sizeof(*pem), M_LINUX, M_WAITOK | M_ZERO); sx_init(&pem->pem_sx, "lpemlk"); p->p_emuldata = pem; } newtd->td_emuldata = em; linux_set_default_openfiles(td, p); linux_set_default_stacksize(td, p); } else { p = td->td_proc; /* exec */ LINUX_CTR1(proc_init, "exec newtd(%d)", p->p_pid); /* lookup the old one */ em = em_find(td); KASSERT(em != NULL, ("proc_init: thread emuldata not found.\n")); em->em_tid = p->p_pid; em->flags = 0; em->robust_futexes = NULL; em->child_clear_tid = NULL; em->child_set_tid = NULL; pem = pem_find(p); KASSERT(pem != NULL, ("proc_init: proc emuldata not found.\n")); pem->persona = 0; pem->oom_score_adj = 0; } } void linux_on_exit(struct proc *p) { struct linux_pemuldata *pem; struct thread *td = curthread; MPASS(SV_CURPROC_ABI() == SV_ABI_LINUX); LINUX_CTR3(proc_exit, "thread(%d) proc(%d) p %p", td->td_tid, p->p_pid, p); pem = pem_find(p); if (pem == NULL) return; (p->p_sysent->sv_thread_detach)(td); p->p_emuldata = NULL; sx_destroy(&pem->pem_sx); free(pem, M_LINUX); } -/* - * If a Linux binary is exec'ing something, try this image activator - * first. We override standard shell script execution in order to - * be able to modify the interpreter path. We only do this if a Linux - * binary is doing the exec, so we do not create an EXEC module for it. - */ -int -linux_exec_imgact_try(struct image_params *imgp) -{ - const char *head = (const char *)imgp->image_header; - char *rpath; - int error = -1; - - /* - * The interpreter for shell scripts run from a Linux binary needs - * to be located in /compat/linux if possible in order to recursively - * maintain Linux path emulation. 
- */ - if (((const short *)head)[0] == SHELLMAGIC) { - /* - * Run our normal shell image activator. If it succeeds attempt - * to use the alternate path for the interpreter. If an - * alternate path is found, use our stringspace to store it. - */ - if ((error = exec_shell_imgact(imgp)) == 0) { - linux_emul_convpath(imgp->interpreter_name, UIO_SYSSPACE, - &rpath, 0, AT_FDCWD); - if (rpath != NULL) - imgp->args->fname_buf = - imgp->interpreter_name = rpath; - } - } - return (error); -} - int linux_common_execve(struct thread *td, struct image_args *eargs) { struct linux_pemuldata *pem; struct vmspace *oldvmspace; struct linux_emuldata *em; struct proc *p; int error; p = td->td_proc; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = kern_execve(td, eargs, NULL, oldvmspace); post_execve(td, error, oldvmspace); if (error != EJUSTRETURN) return (error); /* * In a case of transition from Linux binary execing to * FreeBSD binary we destroy Linux emuldata thread & proc entries. */ if (SV_CURPROC_ABI() != SV_ABI_LINUX) { + + /* Clear ABI root directory if set. */ + linux_pwd_onexec_native(td); + PROC_LOCK(p); em = em_find(td); KASSERT(em != NULL, ("proc_exec: thread emuldata not found.\n")); td->td_emuldata = NULL; pem = pem_find(p); KASSERT(pem != NULL, ("proc_exec: proc pemuldata not found.\n")); p->p_emuldata = NULL; PROC_UNLOCK(p); free(em, M_TEMP); free(pem, M_LINUX); } return (EJUSTRETURN); } -void +int linux_on_exec(struct proc *p, struct image_params *imgp) { struct thread *td; struct thread *othertd; #if defined(__amd64__) struct linux_pemuldata *pem; #endif + int error; td = curthread; MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX); /* * When execing to Linux binary, we create Linux emuldata * thread entry. */ if (SV_PROC_ABI(p) == SV_ABI_LINUX) { /* * Process already was under Linuxolator * before exec. Update emuldata to reflect * single-threaded cleaned state after exec. */ linux_proc_init(td, NULL, false); } else { /* * We are switching the process to Linux emulator. */ linux_proc_init(td, td, false); /* * Create a transient td_emuldata for all suspended * threads, so that p->p_sysent->sv_thread_detach() == * linux_thread_detach() can find expected but unused * emuldata. */ FOREACH_THREAD_IN_PROC(td->td_proc, othertd) { if (othertd == td) continue; linux_proc_init(td, othertd, true); } + + /* Set ABI root directory. */ + if ((error = linux_pwd_onexec(td)) != 0) + return (error); } #if defined(__amd64__) /* * An IA32 executable which has executable stack will have the * READ_IMPLIES_EXEC personality flag set automatically. 
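READ_IMPLIES_EXEC, set just below for ILP32 images with an executable stack, means every readable mapping the process creates is also made executable. A conceptual sketch of that protection fixup; the personality bit value is illustrative.

#include <stdio.h>
#include <sys/mman.h>

#define READ_IMPLIES_EXEC_SKETCH 0x0400000u  /* illustrative personality bit */

/* With the personality bit set, PROT_READ requests also gain PROT_EXEC. */
static int
fixup_prot(int prot, unsigned int persona)
{
    if ((persona & READ_IMPLIES_EXEC_SKETCH) != 0 && (prot & PROT_READ) != 0)
        prot |= PROT_EXEC;
    return (prot);
}

int
main(void)
{
    printf("%#x -> %#x\n", (unsigned)PROT_READ,
        (unsigned)fixup_prot(PROT_READ, READ_IMPLIES_EXEC_SKETCH));
    return (0);
}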
*/ if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && imgp->stack_prot & VM_PROT_EXECUTE) { pem = pem_find(p); pem->persona |= LINUX_READ_IMPLIES_EXEC; } #endif + return (0); } void linux_thread_dtor(struct thread *td) { struct linux_emuldata *em; em = em_find(td); if (em == NULL) return; td->td_emuldata = NULL; LINUX_CTR1(thread_dtor, "thread(%d)", em->em_tid); free(em, M_TEMP); } void linux_schedtail(struct thread *td) { struct linux_emuldata *em; #ifdef KTR int error; #else int error __unused; #endif int *child_set_tid; em = em_find(td); KASSERT(em != NULL, ("linux_schedtail: thread emuldata not found.\n")); child_set_tid = em->child_set_tid; if (child_set_tid != NULL) { error = copyout(&em->em_tid, child_set_tid, sizeof(em->em_tid)); LINUX_CTR4(schedtail, "thread(%d) %p stored %d error %d", td->td_tid, child_set_tid, em->em_tid, error); } else LINUX_CTR1(schedtail, "thread(%d)", em->em_tid); } diff --git a/sys/compat/linux/linux_emul.h b/sys/compat/linux/linux_emul.h index b4234b0f8cbc..0fde8a1abe69 100644 --- a/sys/compat/linux/linux_emul.h +++ b/sys/compat/linux/linux_emul.h @@ -1,84 +1,83 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006 Roman Divacky * All rights reserved. * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _LINUX_EMUL_H_ #define _LINUX_EMUL_H_ struct image_params; /* * modeled after similar structure in NetBSD * this will be extended as we need more functionality */ struct linux_emuldata { int *child_set_tid; /* in clone(): Child's TID to set on clone */ int *child_clear_tid;/* in clone(): Child's TID to clear on exit */ int flags; /* thread emuldata flags */ int em_tid; /* thread id */ struct linux_robust_list_head *robust_futexes; }; struct linux_emuldata *em_find(struct thread *); -int linux_exec_imgact_try(struct image_params *); void linux_proc_init(struct thread *, struct thread *, bool); void linux_on_exit(struct proc *); void linux_schedtail(struct thread *); -void linux_on_exec(struct proc *, struct image_params *); +int linux_on_exec(struct proc *, struct image_params *); void linux_thread_dtor(struct thread *); int linux_common_execve(struct thread *, struct image_args *); /* process emuldata flags */ #define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated futex REQUEUE op*/ #define LINUX_XUNSUP_EPOLL 0x00000002 /* unsupported epoll events */ #define LINUX_XUNSUP_FUTEXPIOP 0x00000004 /* uses unsupported pi futex */ struct linux_pemuldata { uint32_t flags; /* process emuldata flags */ struct sx pem_sx; /* lock for this struct */ uint32_t persona; /* process execution domain */ uint32_t ptrace_flags; /* used by ptrace(2) */ uint32_t oom_score_adj; /* /proc/self/oom_score_adj */ uint32_t so_timestamp; /* requested timeval */ uint32_t so_timestampns; /* requested timespec */ }; #define LINUX_PEM_XLOCK(p) sx_xlock(&(p)->pem_sx) #define LINUX_PEM_XUNLOCK(p) sx_xunlock(&(p)->pem_sx) #define LINUX_PEM_SLOCK(p) sx_slock(&(p)->pem_sx) #define LINUX_PEM_SUNLOCK(p) sx_sunlock(&(p)->pem_sx) struct linux_pemuldata *pem_find(struct proc *); #endif /* !_LINUX_EMUL_H_ */ diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c index 3dfea8946e33..4e8c3f435261 100644 --- a/sys/compat/linux/linux_file.c +++ b/sys/compat/linux/linux_file.c @@ -1,2111 +1,1834 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #include #else #include #include #endif #include #include #include static int linux_common_open(struct thread *, int, const char *, int, int, enum uio_seg); static int linux_do_accessat(struct thread *t, int, const char *, int, int); static int linux_getdents_error(struct thread *, int, int); static struct bsd_to_linux_bitmap seal_bitmap[] = { BITMAP_1t1_LINUX(F_SEAL_SEAL), BITMAP_1t1_LINUX(F_SEAL_SHRINK), BITMAP_1t1_LINUX(F_SEAL_GROW), BITMAP_1t1_LINUX(F_SEAL_WRITE), }; #define MFD_HUGETLB_ENTRY(_size) \ { \ .bsd_value = MFD_HUGE_##_size, \ .linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size \ } static struct bsd_to_linux_bitmap mfd_bitmap[] = { BITMAP_1t1_LINUX(MFD_CLOEXEC), BITMAP_1t1_LINUX(MFD_ALLOW_SEALING), BITMAP_1t1_LINUX(MFD_HUGETLB), MFD_HUGETLB_ENTRY(64KB), MFD_HUGETLB_ENTRY(512KB), MFD_HUGETLB_ENTRY(1MB), MFD_HUGETLB_ENTRY(2MB), MFD_HUGETLB_ENTRY(8MB), MFD_HUGETLB_ENTRY(16MB), MFD_HUGETLB_ENTRY(32MB), MFD_HUGETLB_ENTRY(256MB), MFD_HUGETLB_ENTRY(512MB), MFD_HUGETLB_ENTRY(1GB), MFD_HUGETLB_ENTRY(2GB), MFD_HUGETLB_ENTRY(16GB), }; #undef MFD_HUGETLB_ENTRY #ifdef LINUX_LEGACY_SYSCALLS int linux_creat(struct thread *td, struct linux_creat_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE, - O_WRONLY | O_CREAT | O_TRUNC, args->mode)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, - O_WRONLY | O_CREAT | O_TRUNC, args->mode); - LFREEPATH(path); - return (error); + return (kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE, + O_WRONLY | O_CREAT | O_TRUNC, args->mode)); } #endif static int linux_common_openflags(int l_flags) { int bsd_flags; bsd_flags = 0; switch (l_flags & LINUX_O_ACCMODE) { case LINUX_O_WRONLY: bsd_flags |= O_WRONLY; break; case LINUX_O_RDWR: bsd_flags |= O_RDWR; break; default: bsd_flags |= O_RDONLY; } if (l_flags & LINUX_O_NDELAY) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_O_APPEND) bsd_flags |= O_APPEND; if (l_flags & LINUX_O_SYNC) bsd_flags |= O_FSYNC; if (l_flags & LINUX_O_CLOEXEC) bsd_flags |= O_CLOEXEC; if (l_flags & LINUX_O_NONBLOCK) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_O_ASYNC) bsd_flags |= O_ASYNC; if (l_flags & LINUX_O_CREAT) bsd_flags |= O_CREAT; if (l_flags & LINUX_O_TRUNC) bsd_flags |= O_TRUNC; if (l_flags & LINUX_O_EXCL) bsd_flags |= O_EXCL; if (l_flags & LINUX_O_NOCTTY) bsd_flags |= O_NOCTTY; if (l_flags & LINUX_O_DIRECT) bsd_flags |= O_DIRECT; if (l_flags & LINUX_O_NOFOLLOW) bsd_flags |= O_NOFOLLOW; if (l_flags & LINUX_O_DIRECTORY) bsd_flags |= O_DIRECTORY; if (l_flags & LINUX_O_PATH) bsd_flags |= O_PATH; /* XXX LINUX_O_NOATIME: unable to be easily implemented. */ return (bsd_flags); } static int linux_common_open(struct thread *td, int dirfd, const char *path, int l_flags, int mode, enum uio_seg seg) { struct proc *p = td->td_proc; struct file *fp; int fd; int bsd_flags, error; bsd_flags = linux_common_openflags(l_flags); error = kern_openat(td, dirfd, path, seg, bsd_flags, mode); if (error != 0) { if (error == EMLINK) error = ELOOP; goto done; } if (p->p_flag & P_CONTROLT) goto done; if (bsd_flags & O_NOCTTY) goto done; /* * XXX In between kern_openat() and fget(), another process * having the same filedesc could use that fd without * checking below. 
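linux_common_openflags() above translates the Linux open(2) flag encoding into native O_* bits, starting with the two-bit access mode and then OR-ing in the individual flags. A cut-down userspace sketch of the same mapping; the Linux flag values are the conventional x86 ones and should be treated as assumptions here.

#include <fcntl.h>
#include <stdio.h>

/* Assumed Linux (x86) flag values, octal as in the Linux headers. */
#define L_O_WRONLY    00000001
#define L_O_RDWR      00000002
#define L_O_CREAT     00000100
#define L_O_TRUNC     00001000
#define L_O_APPEND    00002000
#define L_O_NONBLOCK  00004000

static int
openflags_sketch(int l_flags)
{
    int bsd = 0;

    switch (l_flags & 3) {              /* access mode */
    case L_O_WRONLY: bsd |= O_WRONLY; break;
    case L_O_RDWR:   bsd |= O_RDWR;   break;
    default:         bsd |= O_RDONLY; break;
    }
    if (l_flags & L_O_CREAT)    bsd |= O_CREAT;
    if (l_flags & L_O_TRUNC)    bsd |= O_TRUNC;
    if (l_flags & L_O_APPEND)   bsd |= O_APPEND;
    if (l_flags & L_O_NONBLOCK) bsd |= O_NONBLOCK;
    return (bsd);
}

int
main(void)
{
    printf("%#x\n",
        (unsigned)openflags_sketch(L_O_WRONLY | L_O_CREAT | L_O_TRUNC));
    return (0);
}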
*/ fd = td->td_retval[0]; if (fget(td, fd, &cap_ioctl_rights, &fp) == 0) { if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); goto done; } sx_slock(&proctree_lock); PROC_LOCK(p); if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* XXXPJD: Verify if TIOCSCTTY is allowed. */ (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } fdrop(fp, td); } done: return (error); } int linux_openat(struct thread *td, struct linux_openat_args *args) { - char *path; - int dfd, error; + int dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (linux_common_open(td, dfd, args->filename, args->flags, - args->mode, UIO_USERSPACE)); - } - if (args->flags & LINUX_O_CREAT) - LCONVPATH_AT(args->filename, &path, 1, dfd); - else - LCONVPATH_AT(args->filename, &path, 0, dfd); - - error = linux_common_open(td, dfd, path, args->flags, args->mode, - UIO_SYSSPACE); - LFREEPATH(path); - return (error); + return (linux_common_open(td, dfd, args->filename, args->flags, + args->mode, UIO_USERSPACE)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_open(struct thread *td, struct linux_open_args *args) { - char *path; - int error; - - if (!LUSECONVPATH(td)) { - return (linux_common_open(td, AT_FDCWD, args->path, args->flags, - args->mode, UIO_USERSPACE)); - } - if (args->flags & LINUX_O_CREAT) - LCONVPATHCREAT(args->path, &path); - else - LCONVPATHEXIST(args->path, &path); - error = linux_common_open(td, AT_FDCWD, path, args->flags, args->mode, - UIO_SYSSPACE); - LFREEPATH(path); - return (error); + return (linux_common_open(td, AT_FDCWD, args->path, args->flags, + args->mode, UIO_USERSPACE)); } #endif int linux_name_to_handle_at(struct thread *td, struct linux_name_to_handle_at_args *args) { static const l_int valid_flags = (LINUX_AT_SYMLINK_FOLLOW | LINUX_AT_EMPTY_PATH); static const l_uint fh_size = sizeof(fhandle_t); fhandle_t fh; l_uint fh_bytes; l_int mount_id; int error, fd, bsd_flags; if (args->flags & ~valid_flags) return (EINVAL); fd = args->dirfd; if (fd == LINUX_AT_FDCWD) fd = AT_FDCWD; bsd_flags = 0; if (!(args->flags & LINUX_AT_SYMLINK_FOLLOW)) bsd_flags |= AT_SYMLINK_NOFOLLOW; if ((args->flags & LINUX_AT_EMPTY_PATH) != 0) bsd_flags |= AT_EMPTY_PATH; - if (!LUSECONVPATH(td)) { - error = kern_getfhat(td, bsd_flags, fd, args->name, - UIO_USERSPACE, &fh, UIO_SYSSPACE); - } else { - char *path; - - LCONVPATH_AT(args->name, &path, 0, fd); - error = kern_getfhat(td, bsd_flags, fd, path, UIO_SYSSPACE, - &fh, UIO_SYSSPACE); - LFREEPATH(path); - } + error = kern_getfhat(td, bsd_flags, fd, args->name, + UIO_USERSPACE, &fh, UIO_SYSSPACE); if (error != 0) return (error); /* Emit mount_id -- required before EOVERFLOW case. */ mount_id = (fh.fh_fsid.val[0] ^ fh.fh_fsid.val[1]); error = copyout(&mount_id, args->mnt_id, sizeof(mount_id)); if (error != 0) return (error); /* Check if there is room for handle. */ error = copyin(&args->handle->handle_bytes, &fh_bytes, sizeof(fh_bytes)); if (error != 0) return (error); if (fh_bytes < fh_size) { error = copyout(&fh_size, &args->handle->handle_bytes, sizeof(fh_size)); if (error == 0) error = EOVERFLOW; return (error); } /* Emit handle. */ mount_id = 0; /* * We don't use handle_type for anything yet, but initialize a known * value. 
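The LUSECONVPATH/LCONVPATH removals above (linux_creat, linux_open, linux_openat and the rest of this file) drop per-syscall string rewriting in favour of the ABI root set up at exec time by linux_pwd_onexec(). As a userspace analogy of what the retired macros did, the sketch below tries a path under an emulation prefix first and falls back to the native path; the prefix and the use of open(2) are purely illustrative.

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

static int
open_emul(const char *prefix, const char *path, int flags)
{
    char buf[PATH_MAX];
    int fd;

    if (snprintf(buf, sizeof(buf), "%s%s", prefix, path) < (int)sizeof(buf)) {
        fd = open(buf, flags);          /* /compat-style lookup first */
        if (fd >= 0)
            return (fd);
    }
    return (open(path, flags));         /* fall back to the native namespace */
}

int
main(void)
{
    int fd = open_emul("/compat/linux", "/etc/hosts", O_RDONLY);

    if (fd >= 0)
        close(fd);
    printf("fd=%d\n", fd);
    return (0);
}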
*/ error = copyout(&mount_id, &args->handle->handle_type, sizeof(mount_id)); if (error != 0) return (error); error = copyout(&fh, &args->handle->f_handle, sizeof(fh)); return (error); } int linux_open_by_handle_at(struct thread *td, struct linux_open_by_handle_at_args *args) { l_uint fh_bytes; int bsd_flags, error; error = copyin(&args->handle->handle_bytes, &fh_bytes, sizeof(fh_bytes)); if (error != 0) return (error); if (fh_bytes < sizeof(fhandle_t)) return (EINVAL); bsd_flags = linux_common_openflags(args->flags); return (kern_fhopen(td, (void *)&args->handle->f_handle, bsd_flags)); } int linux_lseek(struct thread *td, struct linux_lseek_args *args) { return (kern_lseek(td, args->fdes, args->off, args->whence)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_llseek(struct thread *td, struct linux_llseek_args *args) { int error; off_t off; off = (args->olow) | (((off_t) args->ohigh) << 32); error = kern_lseek(td, args->fd, off, args->whence); if (error != 0) return (error); error = copyout(td->td_retval, args->res, sizeof(off_t)); if (error != 0) return (error); td->td_retval[0] = 0; return (0); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ /* * Note that linux_getdents(2) and linux_getdents64(2) have the same * arguments. They only differ in the definition of struct dirent they * operate on. * Note that linux_readdir(2) is a special case of linux_getdents(2) * where count is always equals 1, meaning that the buffer is one * dirent-structure in size and that the code can't handle more anyway. * Note that linux_readdir(2) can't be implemented by means of linux_getdents(2) * as in case when the *dent buffer size is equal to 1 linux_getdents(2) will * trash user stack. */ static int linux_getdents_error(struct thread *td, int fd, int err) { struct vnode *vp; struct file *fp; int error; /* Linux return ENOTDIR in case when fd is not a directory. */ error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; if (vp->v_type != VDIR) { fdrop(fp, td); return (ENOTDIR); } fdrop(fp, td); return (err); } struct l_dirent { l_ulong d_ino; l_off_t d_off; l_ushort d_reclen; char d_name[LINUX_NAME_MAX + 1]; }; struct l_dirent64 { uint64_t d_ino; int64_t d_off; l_ushort d_reclen; u_char d_type; char d_name[LINUX_NAME_MAX + 1]; }; /* * Linux uses the last byte in the dirent buffer to store d_type, * at least glibc-2.7 requires it. That is why l_dirent is padded with 2 bytes. 
*/ #define LINUX_RECLEN(namlen) \ roundup(offsetof(struct l_dirent, d_name) + (namlen) + 2, sizeof(l_ulong)) #define LINUX_RECLEN64(namlen) \ roundup(offsetof(struct l_dirent64, d_name) + (namlen) + 1, \ sizeof(uint64_t)) #ifdef LINUX_LEGACY_SYSCALLS int linux_getdents(struct thread *td, struct linux_getdents_args *args) { struct dirent *bdp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent *linux_dirent; int buflen, error; size_t retval; buflen = min(args->count, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out1; } lbuf = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); len = td->td_retval[0]; inp = buf; outp = (caddr_t)args->dent; resid = args->count; retval = 0; while (len > 0) { bdp = (struct dirent *) inp; reclen = bdp->d_reclen; linuxreclen = LINUX_RECLEN(bdp->d_namlen); /* * No more space in the user supplied dirent buffer. * Return EINVAL. */ if (resid < linuxreclen) { error = EINVAL; goto out; } linux_dirent = (struct l_dirent*)lbuf; linux_dirent->d_ino = bdp->d_fileno; linux_dirent->d_off = bdp->d_off; linux_dirent->d_reclen = linuxreclen; /* * Copy d_type to last byte of l_dirent buffer */ lbuf[linuxreclen - 1] = bdp->d_type; strlcpy(linux_dirent->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent, d_name)-1); error = copyout(linux_dirent, outp, linuxreclen); if (error != 0) goto out; inp += reclen; base += reclen; len -= reclen; retval += linuxreclen; outp += linuxreclen; resid -= linuxreclen; } td->td_retval[0] = retval; out: free(lbuf, M_TEMP); out1: free(buf, M_TEMP); return (error); } #endif int linux_getdents64(struct thread *td, struct linux_getdents64_args *args) { struct dirent *bdp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent64 *linux_dirent64; int buflen, error; size_t retval; buflen = min(args->count, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out1; } lbuf = malloc(LINUX_RECLEN64(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); len = td->td_retval[0]; inp = buf; outp = (caddr_t)args->dirent; resid = args->count; retval = 0; while (len > 0) { bdp = (struct dirent *) inp; reclen = bdp->d_reclen; linuxreclen = LINUX_RECLEN64(bdp->d_namlen); /* * No more space in the user supplied dirent buffer. * Return EINVAL. 
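The LINUX_RECLEN() padding described above exists so the record's final byte can carry d_type, which old glibc reads from there. A standalone sketch of building one such record with its length rounded up and the type byte placed in the last slot; the struct layout mirrors l_dirent but is local to the sketch.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct l_dirent_sketch {
    unsigned long  d_ino;
    long           d_off;
    unsigned short d_reclen;
    char           d_name[256];
};

/* +2: one byte for the name's NUL, one trailing byte that carries d_type. */
#define RECLEN_SKETCH(namlen)                                         \
    (((offsetof(struct l_dirent_sketch, d_name) + (namlen) + 2) +     \
      sizeof(long) - 1) & ~(sizeof(long) - 1))

int
main(void)
{
    const char *name = "passwd";
    size_t reclen = RECLEN_SKETCH(strlen(name));
    unsigned char rec[512] = { 0 };
    struct l_dirent_sketch d = { .d_ino = 42, .d_off = 1,
        .d_reclen = (unsigned short)reclen };

    strcpy(d.d_name, name);
    memcpy(rec, &d,
        offsetof(struct l_dirent_sketch, d_name) + strlen(name) + 1);
    rec[reclen - 1] = 4;    /* DT_DIR-style type byte in the final slot */
    printf("namlen %zu -> reclen %zu, type byte %u\n",
        strlen(name), reclen, rec[reclen - 1]);
    return (0);
}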
*/ if (resid < linuxreclen) { error = EINVAL; goto out; } linux_dirent64 = (struct l_dirent64*)lbuf; linux_dirent64->d_ino = bdp->d_fileno; linux_dirent64->d_off = bdp->d_off; linux_dirent64->d_reclen = linuxreclen; linux_dirent64->d_type = bdp->d_type; strlcpy(linux_dirent64->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent64, d_name)); error = copyout(linux_dirent64, outp, linuxreclen); if (error != 0) goto out; inp += reclen; base += reclen; len -= reclen; retval += linuxreclen; outp += linuxreclen; resid -= linuxreclen; } td->td_retval[0] = retval; out: free(lbuf, M_TEMP); out1: free(buf, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_readdir(struct thread *td, struct linux_readdir_args *args) { struct dirent *bdp; caddr_t buf; /* BSD-format */ int linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent *linux_dirent; int buflen, error; buflen = LINUX_RECLEN(LINUX_NAME_MAX); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out; } if (td->td_retval[0] == 0) goto out; lbuf = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); bdp = (struct dirent *) buf; linuxreclen = LINUX_RECLEN(bdp->d_namlen); linux_dirent = (struct l_dirent*)lbuf; linux_dirent->d_ino = bdp->d_fileno; linux_dirent->d_off = bdp->d_off; linux_dirent->d_reclen = bdp->d_namlen; strlcpy(linux_dirent->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent, d_name)); error = copyout(linux_dirent, args->dent, linuxreclen); if (error == 0) td->td_retval[0] = linuxreclen; free(lbuf, M_TEMP); out: free(buf, M_TEMP); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ /* * These exist mainly for hooks for doing /compat/linux translation. */ #ifdef LINUX_LEGACY_SYSCALLS int linux_access(struct thread *td, struct linux_access_args *args) { - char *path; - int error; /* Linux convention. */ if (args->amode & ~(F_OK | X_OK | W_OK | R_OK)) return (EINVAL); - if (!LUSECONVPATH(td)) { - error = kern_accessat(td, AT_FDCWD, args->path, UIO_USERSPACE, 0, - args->amode); - } else { - LCONVPATHEXIST(args->path, &path); - error = kern_accessat(td, AT_FDCWD, path, UIO_SYSSPACE, 0, - args->amode); - LFREEPATH(path); - } - - return (error); + return (kern_accessat(td, AT_FDCWD, args->path, UIO_USERSPACE, 0, + args->amode)); } #endif static int linux_do_accessat(struct thread *td, int ldfd, const char *filename, int amode, int flags) { - char *path; - int error, dfd; + int dfd; /* Linux convention. */ if (amode & ~(F_OK | X_OK | W_OK | R_OK)) return (EINVAL); dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd; - if (!LUSECONVPATH(td)) { - error = kern_accessat(td, dfd, filename, UIO_USERSPACE, flags, amode); - } else { - LCONVPATHEXIST_AT(filename, &path, dfd); - error = kern_accessat(td, dfd, path, UIO_SYSSPACE, flags, amode); - LFREEPATH(path); - } - - return (error); + return (kern_accessat(td, dfd, filename, UIO_USERSPACE, flags, amode)); } int linux_faccessat(struct thread *td, struct linux_faccessat_args *args) { return (linux_do_accessat(td, args->dfd, args->filename, args->amode, 0)); } int linux_faccessat2(struct thread *td, struct linux_faccessat2_args *args) { int flags, unsupported; /* XXX. 
AT_SYMLINK_NOFOLLOW is not supported by kern_accessat */ unsupported = args->flags & ~(LINUX_AT_EACCESS | LINUX_AT_EMPTY_PATH); if (unsupported != 0) { linux_msg(td, "faccessat2 unsupported flag 0x%x", unsupported); return (EINVAL); } flags = (args->flags & LINUX_AT_EACCESS) == 0 ? 0 : AT_EACCESS; flags |= (args->flags & LINUX_AT_EMPTY_PATH) == 0 ? 0 : AT_EMPTY_PATH; return (linux_do_accessat(td, args->dfd, args->filename, args->amode, flags)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_unlink(struct thread *td, struct linux_unlink_args *args) { - char *path; int error; struct stat st; - if (!LUSECONVPATH(td)) { - error = kern_funlinkat(td, AT_FDCWD, args->path, FD_NONE, - UIO_USERSPACE, 0, 0); - if (error == EPERM) { - /* Introduce POSIX noncompliant behaviour of Linux */ - if (kern_statat(td, 0, AT_FDCWD, args->path, - UIO_USERSPACE, &st) == 0) { - if (S_ISDIR(st.st_mode)) - error = EISDIR; - } - } - } else { - LCONVPATHEXIST(args->path, &path); - error = kern_funlinkat(td, AT_FDCWD, path, FD_NONE, UIO_SYSSPACE, 0, 0); - if (error == EPERM) { - /* Introduce POSIX noncompliant behaviour of Linux */ - if (kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, - &st) == 0) { - if (S_ISDIR(st.st_mode)) - error = EISDIR; - } + error = kern_funlinkat(td, AT_FDCWD, args->path, FD_NONE, + UIO_USERSPACE, 0, 0); + if (error == EPERM) { + /* Introduce POSIX noncompliant behaviour of Linux */ + if (kern_statat(td, 0, AT_FDCWD, args->path, + UIO_USERSPACE, &st) == 0) { + if (S_ISDIR(st.st_mode)) + error = EISDIR; } - LFREEPATH(path); } return (error); } #endif static int linux_unlinkat_impl(struct thread *td, enum uio_seg pathseg, const char *path, int dfd, struct linux_unlinkat_args *args) { struct stat st; int error; if (args->flag & LINUX_AT_REMOVEDIR) error = kern_frmdirat(td, dfd, path, FD_NONE, pathseg, 0); else error = kern_funlinkat(td, dfd, path, FD_NONE, pathseg, 0, 0); if (error == EPERM && !(args->flag & LINUX_AT_REMOVEDIR)) { /* Introduce POSIX noncompliant behaviour of Linux */ if (kern_statat(td, AT_SYMLINK_NOFOLLOW, dfd, path, pathseg, &st) == 0 && S_ISDIR(st.st_mode)) error = EISDIR; } return (error); } int linux_unlinkat(struct thread *td, struct linux_unlinkat_args *args) { - char *path; - int error, dfd; + int dfd; if (args->flag & ~LINUX_AT_REMOVEDIR) return (EINVAL); dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (linux_unlinkat_impl(td, UIO_USERSPACE, args->pathname, - dfd, args)); - } - LCONVPATHEXIST_AT(args->pathname, &path, dfd); - error = linux_unlinkat_impl(td, UIO_SYSSPACE, path, dfd, args); - LFREEPATH(path); - return (error); + return (linux_unlinkat_impl(td, UIO_USERSPACE, args->pathname, + dfd, args)); } + int linux_chdir(struct thread *td, struct linux_chdir_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_chdir(td, args->path, UIO_USERSPACE)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_chdir(td, path, UIO_SYSSPACE); - LFREEPATH(path); - return (error); + return (kern_chdir(td, args->path, UIO_USERSPACE)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_chmod(struct thread *td, struct linux_chmod_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_fchmodat(td, AT_FDCWD, args->path, UIO_USERSPACE, - args->mode, 0)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_fchmodat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode, 0); - LFREEPATH(path); - return (error); + return (kern_fchmodat(td, AT_FDCWD, args->path, UIO_USERSPACE, + args->mode, 0)); } #endif int linux_fchmodat(struct thread *td, struct linux_fchmodat_args *args) { - char *path; - int error, dfd; + int dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (kern_fchmodat(td, dfd, args->filename, UIO_USERSPACE, - args->mode, 0)); - } - LCONVPATHEXIST_AT(args->filename, &path, dfd); - error = kern_fchmodat(td, dfd, path, UIO_SYSSPACE, args->mode, 0); - LFREEPATH(path); - return (error); + return (kern_fchmodat(td, dfd, args->filename, UIO_USERSPACE, + args->mode, 0)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_mkdir(struct thread *td, struct linux_mkdir_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_mkdirat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->mode)); - } - LCONVPATHCREAT(args->path, &path); - error = kern_mkdirat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode); - LFREEPATH(path); - return (error); + return (kern_mkdirat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->mode)); } #endif int linux_mkdirat(struct thread *td, struct linux_mkdirat_args *args) { - char *path; - int error, dfd; + int dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (kern_mkdirat(td, dfd, args->pathname, UIO_USERSPACE, args->mode)); - } - LCONVPATHCREAT_AT(args->pathname, &path, dfd); - error = kern_mkdirat(td, dfd, path, UIO_SYSSPACE, args->mode); - LFREEPATH(path); - return (error); + return (kern_mkdirat(td, dfd, args->pathname, UIO_USERSPACE, args->mode)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_rmdir(struct thread *td, struct linux_rmdir_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_frmdirat(td, AT_FDCWD, args->path, FD_NONE, - UIO_USERSPACE, 0)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_frmdirat(td, AT_FDCWD, path, FD_NONE, UIO_SYSSPACE, 0); - LFREEPATH(path); - return (error); + return (kern_frmdirat(td, AT_FDCWD, args->path, FD_NONE, + UIO_USERSPACE, 0)); } int linux_rename(struct thread *td, struct linux_rename_args *args) { - char *from, *to; - int error; - if (!LUSECONVPATH(td)) { - return (kern_renameat(td, AT_FDCWD, args->from, AT_FDCWD, - args->to, UIO_USERSPACE)); - } - LCONVPATHEXIST(args->from, &from); - /* Expand LCONVPATHCREATE so that `from' can be freed on errors */ - error = linux_emul_convpath(args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); - if (to == NULL) { - LFREEPATH(from); - return (error); - } - error = kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, UIO_SYSSPACE); - LFREEPATH(from); - LFREEPATH(to); - return (error); + return (kern_renameat(td, AT_FDCWD, args->from, AT_FDCWD, + args->to, UIO_USERSPACE)); } #endif int linux_renameat(struct thread *td, struct linux_renameat_args *args) { struct linux_renameat2_args renameat2_args = { .olddfd = args->olddfd, .oldname = args->oldname, .newdfd = args->newdfd, .newname = args->newname, .flags = 0 }; return (linux_renameat2(td, &renameat2_args)); } int linux_renameat2(struct thread *td, struct linux_renameat2_args *args) { - char *from, *to; - int error, olddfd, newdfd; + int olddfd, newdfd; if (args->flags != 0) { if (args->flags & ~(LINUX_RENAME_EXCHANGE | LINUX_RENAME_NOREPLACE | LINUX_RENAME_WHITEOUT)) return (EINVAL); if (args->flags & LINUX_RENAME_EXCHANGE && args->flags & (LINUX_RENAME_NOREPLACE | LINUX_RENAME_WHITEOUT)) return (EINVAL); #if 0 /* * This spams the console on Ubuntu Focal. * * What's needed here is a general mechanism to let users know * about missing features without hogging the system. */ linux_msg(td, "renameat2 unsupported flags 0x%x", args->flags); #endif return (EINVAL); } olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd; newdfd = (args->newdfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->newdfd; - if (!LUSECONVPATH(td)) { - return (kern_renameat(td, olddfd, args->oldname, newdfd, - args->newname, UIO_USERSPACE)); - } - LCONVPATHEXIST_AT(args->oldname, &from, olddfd); - /* Expand LCONVPATHCREATE so that `from' can be freed on errors */ - error = linux_emul_convpath(args->newname, UIO_USERSPACE, &to, 1, newdfd); - if (to == NULL) { - LFREEPATH(from); - return (error); - } - error = kern_renameat(td, olddfd, from, newdfd, to, UIO_SYSSPACE); - LFREEPATH(from); - LFREEPATH(to); - return (error); + return (kern_renameat(td, olddfd, args->oldname, newdfd, + args->newname, UIO_USERSPACE)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_symlink(struct thread *td, struct linux_symlink_args *args) { - char *path, *to; - int error; - if (!LUSECONVPATH(td)) { - return (kern_symlinkat(td, args->path, AT_FDCWD, args->to, - UIO_USERSPACE)); - } - LCONVPATHEXIST(args->path, &path); - /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ - error = linux_emul_convpath(args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); - if (to == NULL) { - LFREEPATH(path); - return (error); - } - error = kern_symlinkat(td, path, AT_FDCWD, to, UIO_SYSSPACE); - LFREEPATH(path); - LFREEPATH(to); - return (error); + return (kern_symlinkat(td, args->path, AT_FDCWD, args->to, + UIO_USERSPACE)); } #endif int linux_symlinkat(struct thread *td, struct linux_symlinkat_args *args) { - char *path, *to; - int error, dfd; + int dfd; dfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd; - if (!LUSECONVPATH(td)) { - return (kern_symlinkat(td, args->oldname, dfd, args->newname, - UIO_USERSPACE)); - } - LCONVPATHEXIST(args->oldname, &path); - /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ - error = linux_emul_convpath(args->newname, UIO_USERSPACE, &to, 1, dfd); - if (to == NULL) { - LFREEPATH(path); - return (error); - } - error = kern_symlinkat(td, path, dfd, to, UIO_SYSSPACE); - LFREEPATH(path); - LFREEPATH(to); - return (error); + return (kern_symlinkat(td, args->oldname, dfd, args->newname, + UIO_USERSPACE)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_readlink(struct thread *td, struct linux_readlink_args *args) { - char *name; - int error; if (args->count <= 0) return (EINVAL); - if (!LUSECONVPATH(td)) { - return (kern_readlinkat(td, AT_FDCWD, args->name, UIO_USERSPACE, - args->buf, UIO_USERSPACE, args->count)); - } - LCONVPATHEXIST(args->name, &name); - error = kern_readlinkat(td, AT_FDCWD, name, UIO_SYSSPACE, - args->buf, UIO_USERSPACE, args->count); - LFREEPATH(name); - return (error); + return (kern_readlinkat(td, AT_FDCWD, args->name, UIO_USERSPACE, + args->buf, UIO_USERSPACE, args->count)); } #endif int linux_readlinkat(struct thread *td, struct linux_readlinkat_args *args) { - char *name; - int error, dfd; + int dfd; if (args->bufsiz <= 0) return (EINVAL); dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (kern_readlinkat(td, dfd, args->path, UIO_USERSPACE, - args->buf, UIO_USERSPACE, args->bufsiz)); - } - LCONVPATHEXIST_AT(args->path, &name, dfd); - error = kern_readlinkat(td, dfd, name, UIO_SYSSPACE, args->buf, - UIO_USERSPACE, args->bufsiz); - LFREEPATH(name); - return (error); + return (kern_readlinkat(td, dfd, args->path, UIO_USERSPACE, + args->buf, UIO_USERSPACE, args->bufsiz)); } int linux_truncate(struct thread *td, struct linux_truncate_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_truncate(td, args->path, UIO_USERSPACE, args->length)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_truncate(td, path, UIO_SYSSPACE, args->length); - LFREEPATH(path); - return (error); + return (kern_truncate(td, args->path, UIO_USERSPACE, args->length)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_truncate64(struct thread *td, struct linux_truncate64_args *args) { - char *path; off_t length; - int error; #if defined(__amd64__) && defined(COMPAT_LINUX32) length = PAIR32TO64(off_t, args->length); #else length = args->length; #endif - if (!LUSECONVPATH(td)) { - return (kern_truncate(td, args->path, UIO_USERSPACE, length)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_truncate(td, path, UIO_SYSSPACE, length); - LFREEPATH(path); - return (error); + return (kern_truncate(td, args->path, UIO_USERSPACE, length)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args) { return (kern_ftruncate(td, args->fd, args->length)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args) { off_t length; #if defined(__amd64__) && defined(COMPAT_LINUX32) length = PAIR32TO64(off_t, args->length); #else length = args->length; #endif return (kern_ftruncate(td, args->fd, length)); } #endif #ifdef LINUX_LEGACY_SYSCALLS int linux_link(struct thread *td, struct linux_link_args *args) { - char *path, *to; - int error; - if (!LUSECONVPATH(td)) { - return (kern_linkat(td, AT_FDCWD, AT_FDCWD, args->path, args->to, - UIO_USERSPACE, AT_SYMLINK_FOLLOW)); - } - LCONVPATHEXIST(args->path, &path); - /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ - error = linux_emul_convpath(args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); - if (to == NULL) { - LFREEPATH(path); - return (error); - } - error = kern_linkat(td, AT_FDCWD, AT_FDCWD, path, to, UIO_SYSSPACE, - AT_SYMLINK_FOLLOW); - LFREEPATH(path); - LFREEPATH(to); - return (error); + return (kern_linkat(td, AT_FDCWD, AT_FDCWD, args->path, args->to, + UIO_USERSPACE, AT_SYMLINK_FOLLOW)); } #endif int linux_linkat(struct thread *td, struct linux_linkat_args *args) { - char *path, *to; - int error, olddfd, newdfd, flag; + int olddfd, newdfd, flag; if (args->flag & ~(LINUX_AT_SYMLINK_FOLLOW | LINUX_AT_EMPTY_PATH)) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_FOLLOW) != 0 ? AT_SYMLINK_FOLLOW : 0; flag |= (args->flag & LINUX_AT_EMPTY_PATH) != 0 ? AT_EMPTY_PATH : 0; olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd; newdfd = (args->newdfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->newdfd; - if (!LUSECONVPATH(td)) { - return (kern_linkat(td, olddfd, newdfd, args->oldname, - args->newname, UIO_USERSPACE, flag)); - } - LCONVPATHEXIST_AT(args->oldname, &path, olddfd); - /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ - error = linux_emul_convpath(args->newname, UIO_USERSPACE, &to, 1, newdfd); - if (to == NULL) { - LFREEPATH(path); - return (error); - } - error = kern_linkat(td, olddfd, newdfd, path, to, UIO_SYSSPACE, flag); - LFREEPATH(path); - LFREEPATH(to); - return (error); + return (kern_linkat(td, olddfd, newdfd, args->oldname, + args->newname, UIO_USERSPACE, flag)); } int linux_fdatasync(struct thread *td, struct linux_fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } int linux_sync_file_range(struct thread *td, struct linux_sync_file_range_args *uap) { off_t nbytes, offset; #if defined(__amd64__) && defined(COMPAT_LINUX32) nbytes = PAIR32TO64(off_t, uap->nbytes); offset = PAIR32TO64(off_t, uap->offset); #else nbytes = uap->nbytes; offset = uap->offset; #endif if (offset < 0 || nbytes < 0 || (uap->flags & ~(LINUX_SYNC_FILE_RANGE_WAIT_BEFORE | LINUX_SYNC_FILE_RANGE_WRITE | LINUX_SYNC_FILE_RANGE_WAIT_AFTER)) != 0) { return (EINVAL); } return (kern_fsync(td, uap->fd, false)); } int linux_pread(struct thread *td, struct linux_pread_args *uap) { struct vnode *vp; off_t offset; int error; #if defined(__amd64__) && defined(COMPAT_LINUX32) offset = PAIR32TO64(off_t, uap->offset); #else offset = uap->offset; #endif error = kern_pread(td, uap->fd, uap->buf, uap->nbyte, offset); if (error == 0) { /* This seems to violate POSIX but Linux does it. */ error = fgetvp(td, uap->fd, &cap_pread_rights, &vp); if (error != 0) return (error); if (vp->v_type == VDIR) error = EISDIR; vrele(vp); } return (error); } int linux_pwrite(struct thread *td, struct linux_pwrite_args *uap) { off_t offset; #if defined(__amd64__) && defined(COMPAT_LINUX32) offset = PAIR32TO64(off_t, uap->offset); #else offset = uap->offset; #endif return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, offset)); } #define HALF_LONG_BITS ((sizeof(l_long) * NBBY / 2)) static inline off_t pos_from_hilo(unsigned long high, unsigned long low) { return (((off_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } int linux_preadv(struct thread *td, struct linux_preadv_args *uap) { struct uio *auio; int error; off_t offset; /* * According http://man7.org/linux/man-pages/man2/preadv.2.html#NOTES * pos_l and pos_h, respectively, contain the * low order and high order 32 bits of offset. */ offset = pos_from_hilo(uap->pos_h, uap->pos_l); if (offset < 0) return (EINVAL); #ifdef COMPAT_LINUX32 error = linux32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio); #else error = copyinuio(uap->vec, uap->vlen, &auio); #endif if (error != 0) return (error); error = kern_preadv(td, uap->fd, auio, offset); free(auio, M_IOV); return (error); } int linux_pwritev(struct thread *td, struct linux_pwritev_args *uap) { struct uio *auio; int error; off_t offset; /* * According http://man7.org/linux/man-pages/man2/pwritev.2.html#NOTES * pos_l and pos_h, respectively, contain the * low order and high order 32 bits of offset. 
*/ offset = pos_from_hilo(uap->pos_h, uap->pos_l); if (offset < 0) return (EINVAL); #ifdef COMPAT_LINUX32 error = linux32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio); #else error = copyinuio(uap->vec, uap->vlen, &auio); #endif if (error != 0) return (error); error = kern_pwritev(td, uap->fd, auio, offset); free(auio, M_IOV); return (error); } int linux_mount(struct thread *td, struct linux_mount_args *args) { struct mntarg *ma = NULL; char *fstypename, *mntonname, *mntfromname, *data; int error, fsflags; fstypename = malloc(MNAMELEN, M_TEMP, M_WAITOK); mntonname = malloc(MNAMELEN, M_TEMP, M_WAITOK); mntfromname = malloc(MNAMELEN, M_TEMP, M_WAITOK); data = NULL; error = copyinstr(args->filesystemtype, fstypename, MNAMELEN - 1, NULL); if (error != 0) goto out; if (args->specialfile != NULL) { error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL); if (error != 0) goto out; } else { mntfromname[0] = '\0'; } error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL); if (error != 0) goto out; if (strcmp(fstypename, "ext2") == 0) { strcpy(fstypename, "ext2fs"); } else if (strcmp(fstypename, "proc") == 0) { strcpy(fstypename, "linprocfs"); } else if (strcmp(fstypename, "vfat") == 0) { strcpy(fstypename, "msdosfs"); } else if (strcmp(fstypename, "fuse") == 0 || strncmp(fstypename, "fuse.", 5) == 0) { char *fuse_options, *fuse_option, *fuse_name; strcpy(mntfromname, "/dev/fuse"); strcpy(fstypename, "fusefs"); data = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(args->data, data, MNAMELEN - 1, NULL); if (error != 0) goto out; fuse_options = data; while ((fuse_option = strsep(&fuse_options, ",")) != NULL) { fuse_name = strsep(&fuse_option, "="); if (fuse_name == NULL || fuse_option == NULL) goto out; ma = mount_arg(ma, fuse_name, fuse_option, -1); } /* * The FUSE server uses Linux errno values instead of FreeBSD * ones; add a flag to tell fuse(4) to do errno translation. */ ma = mount_arg(ma, "linux_errnos", "1", -1); } fsflags = 0; /* * Linux SYNC flag is not included; the closest equivalent * FreeBSD has is !ASYNC, which is our default. 
*/ if (args->rwflag & LINUX_MS_RDONLY) fsflags |= MNT_RDONLY; if (args->rwflag & LINUX_MS_NOSUID) fsflags |= MNT_NOSUID; if (args->rwflag & LINUX_MS_NOEXEC) fsflags |= MNT_NOEXEC; if (args->rwflag & LINUX_MS_REMOUNT) fsflags |= MNT_UPDATE; ma = mount_arg(ma, "fstype", fstypename, -1); ma = mount_arg(ma, "fspath", mntonname, -1); ma = mount_arg(ma, "from", mntfromname, -1); error = kernel_mount(ma, fsflags); out: free(fstypename, M_TEMP); free(mntonname, M_TEMP); free(mntfromname, M_TEMP); free(data, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_oldumount(struct thread *td, struct linux_oldumount_args *args) { return (kern_unmount(td, args->path, 0)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #ifdef LINUX_LEGACY_SYSCALLS int linux_umount(struct thread *td, struct linux_umount_args *args) { int flags; flags = 0; if ((args->flags & LINUX_MNT_FORCE) != 0) { args->flags &= ~LINUX_MNT_FORCE; flags |= MNT_FORCE; } if (args->flags != 0) { linux_msg(td, "unsupported umount2 flags %#x", args->flags); return (EINVAL); } return (kern_unmount(td, args->path, flags)); } #endif /* * fcntl family of syscalls */ struct l_flock { l_short l_type; l_short l_whence; l_off_t l_start; l_off_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; bsd_flock->l_sysid = 0; } static void bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_off_t)bsd_flock->l_start; linux_flock->l_len = (l_off_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct l_flock64 { l_short l_type; l_short l_whence; l_loff_t l_start; l_loff_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; bsd_flock->l_sysid = 0; } static void bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = 
(l_loff_t)bsd_flock->l_start; linux_flock->l_len = (l_loff_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ static int fcntl_common(struct thread *td, struct linux_fcntl_args *args) { struct l_flock linux_flock; struct flock bsd_flock; struct pipe *fpipe; struct file *fp; long arg; int error, result; switch (args->cmd) { case LINUX_F_DUPFD: return (kern_fcntl(td, args->fd, F_DUPFD, args->arg)); case LINUX_F_GETFD: return (kern_fcntl(td, args->fd, F_GETFD, 0)); case LINUX_F_SETFD: return (kern_fcntl(td, args->fd, F_SETFD, args->arg)); case LINUX_F_GETFL: error = kern_fcntl(td, args->fd, F_GETFL, 0); result = td->td_retval[0]; td->td_retval[0] = 0; if (result & O_RDONLY) td->td_retval[0] |= LINUX_O_RDONLY; if (result & O_WRONLY) td->td_retval[0] |= LINUX_O_WRONLY; if (result & O_RDWR) td->td_retval[0] |= LINUX_O_RDWR; if (result & O_NDELAY) td->td_retval[0] |= LINUX_O_NONBLOCK; if (result & O_APPEND) td->td_retval[0] |= LINUX_O_APPEND; if (result & O_FSYNC) td->td_retval[0] |= LINUX_O_SYNC; if (result & O_ASYNC) td->td_retval[0] |= LINUX_O_ASYNC; #ifdef LINUX_O_NOFOLLOW if (result & O_NOFOLLOW) td->td_retval[0] |= LINUX_O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (result & O_DIRECT) td->td_retval[0] |= LINUX_O_DIRECT; #endif return (error); case LINUX_F_SETFL: arg = 0; if (args->arg & LINUX_O_NDELAY) arg |= O_NONBLOCK; if (args->arg & LINUX_O_APPEND) arg |= O_APPEND; if (args->arg & LINUX_O_SYNC) arg |= O_FSYNC; if (args->arg & LINUX_O_ASYNC) arg |= O_ASYNC; #ifdef LINUX_O_NOFOLLOW if (args->arg & LINUX_O_NOFOLLOW) arg |= O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (args->arg & LINUX_O_DIRECT) arg |= O_DIRECT; #endif return (kern_fcntl(td, args->fd, F_SETFL, arg)); case LINUX_F_GETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); case LINUX_F_GETOWN: return (kern_fcntl(td, args->fd, F_GETOWN, 0)); case LINUX_F_SETOWN: /* * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). */ error = fget(td, args->fd, &cap_fcntl_rights, &fp); if (error) return (error); if (fp->f_type == DTYPE_PIPE) { fdrop(fp, td); return (EINVAL); } fdrop(fp, td); return (kern_fcntl(td, args->fd, F_SETOWN, args->arg)); case LINUX_F_DUPFD_CLOEXEC: return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg)); /* * Our F_SEAL_* values match Linux one for maximum compatibility. So we * only needed to account for different values for fcntl(2) commands. 
*/ case LINUX_F_GET_SEALS: error = kern_fcntl(td, args->fd, F_GET_SEALS, 0); if (error != 0) return (error); td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0], seal_bitmap, 0); return (0); case LINUX_F_ADD_SEALS: return (kern_fcntl(td, args->fd, F_ADD_SEALS, linux_to_bsd_bits(args->arg, seal_bitmap, 0))); case LINUX_F_GETPIPE_SZ: error = fget(td, args->fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_PIPE) { fdrop(fp, td); return (EINVAL); } fpipe = fp->f_data; td->td_retval[0] = fpipe->pipe_buffer.size; fdrop(fp, td); return (0); default: linux_msg(td, "unsupported fcntl cmd %d", args->cmd); return (EINVAL); } } int linux_fcntl(struct thread *td, struct linux_fcntl_args *args) { return (fcntl_common(td, args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args) { struct l_flock64 linux_flock; struct flock bsd_flock; struct linux_fcntl_args fcntl_args; int error; switch (args->cmd) { case LINUX_F_GETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock64(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); } fcntl_args.fd = args->fd; fcntl_args.cmd = args->cmd; fcntl_args.arg = args->arg; return (fcntl_common(td, &fcntl_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #ifdef LINUX_LEGACY_SYSCALLS int linux_chown(struct thread *td, struct linux_chown_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, - args->uid, args->gid, 0)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, args->uid, - args->gid, 0); - LFREEPATH(path); - return (error); + return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, + args->uid, args->gid, 0)); } #endif int linux_fchownat(struct thread *td, struct linux_fchownat_args *args) { - char *path; - int error, dfd, flag, unsupported; + int dfd, flag, unsupported; unsupported = args->flag & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH); if (unsupported != 0) { linux_msg(td, "fchownat unsupported flag 0x%x", unsupported); return (EINVAL); } flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) == 0 ? 0 : AT_SYMLINK_NOFOLLOW; flag |= (args->flag & LINUX_AT_EMPTY_PATH) == 0 ? 0 : AT_EMPTY_PATH; dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - return (kern_fchownat(td, dfd, args->filename, UIO_USERSPACE, - args->uid, args->gid, flag)); - } - LCONVPATHEXIST_AT(args->filename, &path, dfd); - error = kern_fchownat(td, dfd, path, UIO_SYSSPACE, args->uid, args->gid, - flag); - LFREEPATH(path); - return (error); + return (kern_fchownat(td, dfd, args->filename, UIO_USERSPACE, + args->uid, args->gid, flag)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_lchown(struct thread *td, struct linux_lchown_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td)) { - return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->uid, - args->gid, AT_SYMLINK_NOFOLLOW)); - } - LCONVPATHEXIST(args->path, &path); - error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, args->uid, args->gid, - AT_SYMLINK_NOFOLLOW); - LFREEPATH(path); - return (error); + return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->uid, + args->gid, AT_SYMLINK_NOFOLLOW)); } #endif static int convert_fadvice(int advice) { switch (advice) { case LINUX_POSIX_FADV_NORMAL: return (POSIX_FADV_NORMAL); case LINUX_POSIX_FADV_RANDOM: return (POSIX_FADV_RANDOM); case LINUX_POSIX_FADV_SEQUENTIAL: return (POSIX_FADV_SEQUENTIAL); case LINUX_POSIX_FADV_WILLNEED: return (POSIX_FADV_WILLNEED); case LINUX_POSIX_FADV_DONTNEED: return (POSIX_FADV_DONTNEED); case LINUX_POSIX_FADV_NOREUSE: return (POSIX_FADV_NOREUSE); default: return (-1); } } int linux_fadvise64(struct thread *td, struct linux_fadvise64_args *args) { off_t offset; int advice; #if defined(__amd64__) && defined(COMPAT_LINUX32) offset = PAIR32TO64(off_t, args->offset); #else offset = args->offset; #endif advice = convert_fadvice(args->advice); if (advice == -1) return (EINVAL); return (kern_posix_fadvise(td, args->fd, offset, args->len, advice)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_fadvise64_64(struct thread *td, struct linux_fadvise64_64_args *args) { off_t len, offset; int advice; #if defined(__amd64__) && defined(COMPAT_LINUX32) len = PAIR32TO64(off_t, args->len); offset = PAIR32TO64(off_t, args->offset); #else len = args->len; offset = args->offset; #endif advice = convert_fadvice(args->advice); if (advice == -1) return (EINVAL); return (kern_posix_fadvise(td, args->fd, offset, len, advice)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #ifdef LINUX_LEGACY_SYSCALLS int linux_pipe(struct thread *td, struct linux_pipe_args *args) { int fildes[2]; int error; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error != 0) return (error); error = copyout(fildes, args->pipefds, sizeof(fildes)); if (error != 0) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } #endif int linux_pipe2(struct thread *td, struct linux_pipe2_args *args) { int fildes[2]; int error, flags; if ((args->flags & ~(LINUX_O_NONBLOCK | LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_NONBLOCK) != 0) flags |= O_NONBLOCK; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; error = kern_pipe(td, fildes, flags, NULL, NULL); if (error != 0) return (error); error = copyout(fildes, args->pipefds, sizeof(fildes)); if (error != 0) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } int linux_dup3(struct thread *td, struct linux_dup3_args *args) { int cmd; intptr_t newfd; if (args->oldfd == args->newfd) return (EINVAL); if ((args->flags & ~LINUX_O_CLOEXEC) != 0) return (EINVAL); if (args->flags & LINUX_O_CLOEXEC) cmd = 
F_DUP2FD_CLOEXEC; else cmd = F_DUP2FD; newfd = args->newfd; return (kern_fcntl(td, args->oldfd, cmd, newfd)); } int linux_fallocate(struct thread *td, struct linux_fallocate_args *args) { off_t len, offset; /* * We emulate only posix_fallocate system call for which * mode should be 0. */ if (args->mode != 0) return (EOPNOTSUPP); #if defined(__amd64__) && defined(COMPAT_LINUX32) len = PAIR32TO64(off_t, args->len); offset = PAIR32TO64(off_t, args->offset); #else len = args->len; offset = args->offset; #endif return (kern_posix_fallocate(td, args->fd, offset, len)); } int linux_copy_file_range(struct thread *td, struct linux_copy_file_range_args *args) { l_loff_t inoff, outoff, *inoffp, *outoffp; int error, flags; /* * copy_file_range(2) on Linux doesn't define any flags (yet), so is * the native implementation. Enforce it. */ if (args->flags != 0) { linux_msg(td, "copy_file_range unsupported flags 0x%x", args->flags); return (EINVAL); } flags = 0; inoffp = outoffp = NULL; if (args->off_in != NULL) { error = copyin(args->off_in, &inoff, sizeof(l_loff_t)); if (error != 0) return (error); inoffp = &inoff; } if (args->off_out != NULL) { error = copyin(args->off_out, &outoff, sizeof(l_loff_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, args->fd_in, inoffp, args->fd_out, outoffp, args->len, flags); if (error == 0 && args->off_in != NULL) error = copyout(inoffp, args->off_in, sizeof(l_loff_t)); if (error == 0 && args->off_out != NULL) error = copyout(outoffp, args->off_out, sizeof(l_loff_t)); return (error); } #define LINUX_MEMFD_PREFIX "memfd:" int linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args) { char memfd_name[LINUX_NAME_MAX + 1]; int error, flags, shmflags, oflags; /* * This is our clever trick to avoid the heap allocation to copy in the * uname. We don't really need to go this far out of our way, but it * does keep the rest of this function fairly clean as they don't have * to worry about cleanup on the way out. */ error = copyinstr(args->uname_ptr, memfd_name + sizeof(LINUX_MEMFD_PREFIX) - 1, LINUX_NAME_MAX - sizeof(LINUX_MEMFD_PREFIX) - 1, NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } memcpy(memfd_name, LINUX_MEMFD_PREFIX, sizeof(LINUX_MEMFD_PREFIX) - 1); flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0); if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_HUGE_MASK)) != 0) return (EINVAL); /* Size specified but no HUGETLB. */ if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) return (EINVAL); /* We don't actually support HUGETLB. */ if ((flags & MFD_HUGETLB) != 0) return (ENOSYS); oflags = O_RDWR; shmflags = SHM_GROW_ON_WRITE; if ((flags & MFD_CLOEXEC) != 0) oflags |= O_CLOEXEC; if ((flags & MFD_ALLOW_SEALING) != 0) shmflags |= SHM_ALLOW_SEALING; return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL, memfd_name)); } int linux_splice(struct thread *td, struct linux_splice_args *args) { linux_msg(td, "syscall splice not really implemented"); /* * splice(2) is documented to return EINVAL in various circumstances; * returning it instead of ENOSYS should hint the caller to use fallback * instead. 
*/ return (EINVAL); } int linux_close_range(struct thread *td, struct linux_close_range_args *args) { u_int flags = 0; /* * Implementing close_range(CLOSE_RANGE_UNSHARE) allows Linux to * unshare the filedesc table of the calling thread from other threads * in its thread group (i.e., a process on FreeBSD) or from other processes * that share the same table, before closing the files. FreeBSD does * not have a compatible unsharing mechanism, because sharing of process * resources, including the filedesc table, happens at the thread level on * Linux but at the process level on FreeBSD. * Return EINVAL for now if the CLOSE_RANGE_UNSHARE flag is specified, * until this new Linux API stabilizes. */ if ((args->flags & ~(LINUX_CLOSE_RANGE_CLOEXEC)) != 0) return (EINVAL); if (args->first > args->last) return (EINVAL); if ((args->flags & LINUX_CLOSE_RANGE_CLOEXEC) != 0) flags |= CLOSE_RANGE_CLOEXEC; return (kern_close_range(td, flags, args->first, args->last)); } diff --git a/sys/compat/linux/linux_mib.c b/sys/compat/linux/linux_mib.c index 29e86ea1f07b..714225ef9f1c 100644 --- a/sys/compat/linux/linux_mib.c +++ b/sys/compat/linux/linux_mib.c @@ -1,587 +1,583 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include struct linux_prison { char pr_osname[LINUX_MAX_UTSNAME]; char pr_osrelease[LINUX_MAX_UTSNAME]; int pr_oss_version; int pr_osrel; }; static struct linux_prison lprison0 = { .pr_osname = "Linux", .pr_osrelease = LINUX_VERSION_STR, .pr_oss_version = 0x030600, .pr_osrel = LINUX_VERSION_CODE }; static unsigned linux_osd_jail_slot; SYSCTL_NODE(_compat, OID_AUTO, linux, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Linux mode"); int linux_debug = 3; SYSCTL_INT(_compat_linux, OID_AUTO, debug, CTLFLAG_RWTUN, &linux_debug, 0, "Log warnings from linux(4); or 0 to disable"); int linux_default_openfiles = 1024; SYSCTL_INT(_compat_linux, OID_AUTO, default_openfiles, CTLFLAG_RWTUN, &linux_default_openfiles, 0, "Default soft openfiles resource limit, or -1 for unlimited"); int linux_default_stacksize = 8 * 1024 * 1024; SYSCTL_INT(_compat_linux, OID_AUTO, default_stacksize, CTLFLAG_RWTUN, &linux_default_stacksize, 0, "Default soft stack size resource limit, or -1 for unlimited"); int linux_dummy_rlimits = 0; SYSCTL_INT(_compat_linux, OID_AUTO, dummy_rlimits, CTLFLAG_RWTUN, &linux_dummy_rlimits, 0, "Return dummy values for unsupported Linux-specific rlimits"); int linux_ignore_ip_recverr = 1; SYSCTL_INT(_compat_linux, OID_AUTO, ignore_ip_recverr, CTLFLAG_RWTUN, &linux_ignore_ip_recverr, 0, "Ignore enabling IP_RECVERR"); int linux_preserve_vstatus = 1; SYSCTL_INT(_compat_linux, OID_AUTO, preserve_vstatus, CTLFLAG_RWTUN, &linux_preserve_vstatus, 0, "Preserve VSTATUS termios(4) flag"); bool linux_map_sched_prio = true; SYSCTL_BOOL(_compat_linux, OID_AUTO, map_sched_prio, CTLFLAG_RDTUN, &linux_map_sched_prio, 0, "Map scheduler priorities to Linux priorities " "(not POSIX compliant)"); -int linux_use_emul_path = 1; -SYSCTL_INT(_compat_linux, OID_AUTO, use_emul_path, CTLFLAG_RWTUN, - &linux_use_emul_path, 0, "Use linux.compat.emul_path"); - static bool linux_setid_allowed = true; SYSCTL_BOOL(_compat_linux, OID_AUTO, setid_allowed, CTLFLAG_RWTUN, &linux_setid_allowed, 0, "Allow setuid/setgid on execve of Linux binary"); int linux_setid_allowed_query(struct thread *td __unused, struct image_params *imgp __unused) { return (linux_setid_allowed); } static int linux_set_osname(struct thread *td, char *osname); static int linux_set_osrelease(struct thread *td, char *osrelease); static int linux_set_oss_version(struct thread *td, int oss_version); static int linux_sysctl_osname(SYSCTL_HANDLER_ARGS) { char osname[LINUX_MAX_UTSNAME]; int error; linux_get_osname(req->td, osname); error = sysctl_handle_string(oidp, osname, LINUX_MAX_UTSNAME, req); if (error != 0 || req->newptr == NULL) return (error); error = linux_set_osname(req->td, osname); return (error); } SYSCTL_PROC(_compat_linux, OID_AUTO, osname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, linux_sysctl_osname, "A", "Linux kernel OS name"); static int linux_sysctl_osrelease(SYSCTL_HANDLER_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; int error; linux_get_osrelease(req->td, osrelease); error = sysctl_handle_string(oidp, osrelease, LINUX_MAX_UTSNAME, req); if (error != 0 || req->newptr == NULL) return (error); error = linux_set_osrelease(req->td, osrelease); return (error); } SYSCTL_PROC(_compat_linux, OID_AUTO, osrelease, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, linux_sysctl_osrelease, "A", "Linux kernel OS release"); static int linux_sysctl_oss_version(SYSCTL_HANDLER_ARGS) { int oss_version; int error; oss_version = 
linux_get_oss_version(req->td); error = sysctl_handle_int(oidp, &oss_version, 0, req); if (error != 0 || req->newptr == NULL) return (error); error = linux_set_oss_version(req->td, oss_version); return (error); } SYSCTL_PROC(_compat_linux, OID_AUTO, oss_version, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, linux_sysctl_oss_version, "I", "Linux OSS version"); /* * Map the osrelease into integer */ static int linux_map_osrel(char *osrelease, int *osrel) { char *sep, *eosrelease; int len, v0, v1, v2, v; len = strlen(osrelease); eosrelease = osrelease + len; v0 = strtol(osrelease, &sep, 10); if (osrelease == sep || sep + 1 >= eosrelease || *sep != '.') return (EINVAL); osrelease = sep + 1; v1 = strtol(osrelease, &sep, 10); if (osrelease == sep || sep + 1 >= eosrelease || *sep != '.') return (EINVAL); osrelease = sep + 1; v2 = strtol(osrelease, &sep, 10); if (osrelease == sep || (sep != eosrelease && (sep + 1 >= eosrelease || *sep != '-'))) return (EINVAL); v = LINUX_KERNVER(v0, v1, v2); if (v < LINUX_KERNVER(1, 0, 0)) return (EINVAL); if (osrel != NULL) *osrel = v; return (0); } /* * Find a prison with Linux info. * Return the Linux info and the (locked) prison. */ static struct linux_prison * linux_find_prison(struct prison *spr, struct prison **prp) { struct prison *pr; struct linux_prison *lpr; for (pr = spr;; pr = pr->pr_parent) { mtx_lock(&pr->pr_mtx); lpr = (pr == &prison0) ? &lprison0 : osd_jail_get(pr, linux_osd_jail_slot); if (lpr != NULL) break; mtx_unlock(&pr->pr_mtx); } *prp = pr; return (lpr); } /* * Ensure a prison has its own Linux info. If lprp is non-null, point it to * the Linux info and lock the prison. */ static void linux_alloc_prison(struct prison *pr, struct linux_prison **lprp) { struct prison *ppr; struct linux_prison *lpr, *nlpr; void **rsv; /* If this prison already has Linux info, return that. */ lpr = linux_find_prison(pr, &ppr); if (ppr == pr) goto done; /* * Allocate a new info record. Then check again, in case something * changed during the allocation. */ mtx_unlock(&ppr->pr_mtx); nlpr = malloc(sizeof(struct linux_prison), M_PRISON, M_WAITOK); rsv = osd_reserve(linux_osd_jail_slot); lpr = linux_find_prison(pr, &ppr); if (ppr == pr) { free(nlpr, M_PRISON); osd_free_reserved(rsv); goto done; } /* Inherit the initial values from the ancestor. */ mtx_lock(&pr->pr_mtx); (void)osd_jail_set_reserved(pr, linux_osd_jail_slot, rsv, nlpr); bcopy(lpr, nlpr, sizeof(*lpr)); lpr = nlpr; mtx_unlock(&ppr->pr_mtx); done: if (lprp != NULL) *lprp = lpr; else mtx_unlock(&pr->pr_mtx); } /* * Jail OSD methods for Linux prison data. */ static int linux_prison_create(void *obj, void *data) { struct prison *pr = obj; struct vfsoptlist *opts = data; int jsys; if (vfs_copyopt(opts, "linux", &jsys, sizeof(jsys)) == 0 && jsys == JAIL_SYS_INHERIT) return (0); /* * Inherit a prison's initial values from its parent * (different from JAIL_SYS_INHERIT which also inherits changes). */ linux_alloc_prison(pr, NULL); return (0); } static int linux_prison_check(void *obj __unused, void *data) { struct vfsoptlist *opts = data; char *osname, *osrelease; int error, jsys, len, oss_version; /* Check that the parameters are correct. 
*/ error = vfs_copyopt(opts, "linux", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) return (EINVAL); } error = vfs_getopt(opts, "linux.osname", (void **)&osname, &len); if (error != ENOENT) { if (error != 0) return (error); if (len == 0 || osname[len - 1] != '\0') return (EINVAL); if (len > LINUX_MAX_UTSNAME) { vfs_opterror(opts, "linux.osname too long"); return (ENAMETOOLONG); } } error = vfs_getopt(opts, "linux.osrelease", (void **)&osrelease, &len); if (error != ENOENT) { if (error != 0) return (error); if (len == 0 || osrelease[len - 1] != '\0') return (EINVAL); if (len > LINUX_MAX_UTSNAME) { vfs_opterror(opts, "linux.osrelease too long"); return (ENAMETOOLONG); } error = linux_map_osrel(osrelease, NULL); if (error != 0) { vfs_opterror(opts, "linux.osrelease format error"); return (error); } } error = vfs_copyopt(opts, "linux.oss_version", &oss_version, sizeof(oss_version)); if (error == ENOENT) error = 0; return (error); } static int linux_prison_set(void *obj, void *data) { struct linux_prison *lpr; struct prison *pr = obj; struct vfsoptlist *opts = data; char *osname, *osrelease; int error, gotversion, jsys, len, oss_version; /* Set the parameters, which should be correct. */ error = vfs_copyopt(opts, "linux", &jsys, sizeof(jsys)); if (error == ENOENT) jsys = -1; error = vfs_getopt(opts, "linux.osname", (void **)&osname, &len); if (error == ENOENT) osname = NULL; else jsys = JAIL_SYS_NEW; error = vfs_getopt(opts, "linux.osrelease", (void **)&osrelease, &len); if (error == ENOENT) osrelease = NULL; else jsys = JAIL_SYS_NEW; error = vfs_copyopt(opts, "linux.oss_version", &oss_version, sizeof(oss_version)); if (error == ENOENT) gotversion = 0; else { gotversion = 1; jsys = JAIL_SYS_NEW; } switch (jsys) { case JAIL_SYS_INHERIT: /* "linux=inherit": inherit the parent's Linux info. */ mtx_lock(&pr->pr_mtx); osd_jail_del(pr, linux_osd_jail_slot); mtx_unlock(&pr->pr_mtx); break; case JAIL_SYS_NEW: /* * "linux=new" or "linux.*": * the prison gets its own Linux info. */ linux_alloc_prison(pr, &lpr); if (osrelease) { (void)linux_map_osrel(osrelease, &lpr->pr_osrel); strlcpy(lpr->pr_osrelease, osrelease, LINUX_MAX_UTSNAME); } if (osname) strlcpy(lpr->pr_osname, osname, LINUX_MAX_UTSNAME); if (gotversion) lpr->pr_oss_version = oss_version; mtx_unlock(&pr->pr_mtx); } return (0); } SYSCTL_JAIL_PARAM_SYS_NODE(linux, CTLFLAG_RW, "Jail Linux parameters"); SYSCTL_JAIL_PARAM_STRING(_linux, osname, CTLFLAG_RW, LINUX_MAX_UTSNAME, "Jail Linux kernel OS name"); SYSCTL_JAIL_PARAM_STRING(_linux, osrelease, CTLFLAG_RW, LINUX_MAX_UTSNAME, "Jail Linux kernel OS release"); SYSCTL_JAIL_PARAM(_linux, oss_version, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail Linux OSS version"); static int linux_prison_get(void *obj, void *data) { struct linux_prison *lpr; struct prison *ppr; struct prison *pr = obj; struct vfsoptlist *opts = data; int error, i; static int version0; /* See if this prison is the one with the Linux info. */ lpr = linux_find_prison(pr, &ppr); i = (ppr == pr) ? 
JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "linux", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; if (i) { error = vfs_setopts(opts, "linux.osname", lpr->pr_osname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "linux.osrelease", lpr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "linux.oss_version", &lpr->pr_oss_version, sizeof(lpr->pr_oss_version)); if (error != 0 && error != ENOENT) goto done; } else { /* * If this prison is inheriting its Linux info, report * empty/zero parameters. */ error = vfs_setopts(opts, "linux.osname", ""); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "linux.osrelease", ""); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "linux.oss_version", &version0, sizeof(lpr->pr_oss_version)); if (error != 0 && error != ENOENT) goto done; } error = 0; done: mtx_unlock(&ppr->pr_mtx); return (error); } static void linux_prison_destructor(void *data) { free(data, M_PRISON); } void linux_osd_jail_register(void) { struct prison *pr; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CREATE] = linux_prison_create, [PR_METHOD_GET] = linux_prison_get, [PR_METHOD_SET] = linux_prison_set, [PR_METHOD_CHECK] = linux_prison_check }; linux_osd_jail_slot = osd_jail_register(linux_prison_destructor, methods); /* Copy the system Linux info to any current prisons. */ sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) linux_alloc_prison(pr, NULL); sx_sunlock(&allprison_lock); } void linux_osd_jail_deregister(void) { osd_jail_deregister(linux_osd_jail_slot); } void linux_get_osname(struct thread *td, char *dst) { struct prison *pr; struct linux_prison *lpr; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); bcopy(lpr->pr_osname, dst, LINUX_MAX_UTSNAME); mtx_unlock(&pr->pr_mtx); } static int linux_set_osname(struct thread *td, char *osname) { struct prison *pr; struct linux_prison *lpr; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); strlcpy(lpr->pr_osname, osname, LINUX_MAX_UTSNAME); mtx_unlock(&pr->pr_mtx); return (0); } void linux_get_osrelease(struct thread *td, char *dst) { struct prison *pr; struct linux_prison *lpr; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); bcopy(lpr->pr_osrelease, dst, LINUX_MAX_UTSNAME); mtx_unlock(&pr->pr_mtx); } int linux_kernver(struct thread *td) { struct prison *pr; struct linux_prison *lpr; int osrel; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); osrel = lpr->pr_osrel; mtx_unlock(&pr->pr_mtx); return (osrel); } static int linux_set_osrelease(struct thread *td, char *osrelease) { struct prison *pr; struct linux_prison *lpr; int error; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); error = linux_map_osrel(osrelease, &lpr->pr_osrel); if (error == 0) strlcpy(lpr->pr_osrelease, osrelease, LINUX_MAX_UTSNAME); mtx_unlock(&pr->pr_mtx); return (error); } int linux_get_oss_version(struct thread *td) { struct prison *pr; struct linux_prison *lpr; int version; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); version = lpr->pr_oss_version; mtx_unlock(&pr->pr_mtx); return (version); } static int linux_set_oss_version(struct thread *td, int oss_version) { struct prison *pr; struct linux_prison *lpr; lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); lpr->pr_oss_version = oss_version; mtx_unlock(&pr->pr_mtx); return (0); } diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c index bc6ff9559493..b5d48d106be6 100644 --- 
a/sys/compat/linux/linux_misc.c +++ b/sys/compat/linux/linux_misc.c @@ -1,2650 +1,2574 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include #include int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalhigh; l_ulong freehigh; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; struct l_pselect6arg { l_uintptr_t ss; l_size_t ss_len; }; static int linux_utimensat_lts_to_ts(struct l_timespec *, struct timespec *); #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int linux_utimensat_lts64_to_ts(struct l_timespec64 *, struct timespec *); #endif static int linux_common_utimensat(struct thread *, int, const char *, struct timespec *, int); static int linux_common_pselect6(struct thread *, l_int, l_fd_set *, l_fd_set *, l_fd_set *, struct timespec *, l_uintptr_t *); static int linux_common_ppoll(struct thread *, struct 
pollfd *, uint32_t, struct timespec *, l_sigset_t *, l_size_t); static int linux_pollin(struct thread *, struct pollfd *, struct pollfd *, u_int); static int linux_pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; int i, j; struct timespec ts; bzero(&sysinfo, sizeof(sysinfo)); getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE; /* * sharedram counts pages allocated to named, swap-backed objects such * as shared memory segments and tmpfs files. There is no cheap way to * compute this, so just leave the field unpopulated. Linux itself only * started setting this field in the 3.x timeframe. */ sysinfo.sharedram = 0; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* * Platforms supported by the emulation layer do not have a notion of * high memory. */ sysinfo.totalhigh = 0; sysinfo.freehigh = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } #ifdef LINUX_LEGACY_SYSCALLS int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error __diagused; secs = args->secs; /* * Linux alarm() is always successful. Limit secs to INT32_MAX / 2 * to match kern_setitimer()'s limit and avoid an error from it. * * XXX. Linux limits secs to INT_MAX on 32-bit and does not limit it on 64-bit * platforms. */ if (secs > INT32_MAX / 2) secs = INT32_MAX / 2; it.it_value.tv_sec = secs; it.it_value.tv_usec = 0; timevalclear(&it.it_interval); error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); KASSERT(error == 0, ("kern_setitimer returns %d", error)); if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) || old_it.it_value.tv_usec >= 500000) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; return (0); } #endif int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; uintptr_t new, old; old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (uintptr_t)args->dsend; if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new)) td->td_retval[0] = (register_t)new; else td->td_retval[0] = (register_t)old; return (0); } #ifdef LINUX_LEGACY_SYSCALLS int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; /* * Store the current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, &ltv, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux.
*/ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, LINUX_NFDBITS); if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: return (error); } #endif int linux_mremap(struct thread *td, struct linux_mremap_args *args) { uintptr_t addr; size_t len; int error = 0; if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { addr = args->addr + args->new_len; len = args->old_len - args->new_len; error = kern_munmap(td, addr, len); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { return (kern_msync(td, args->addr, args->len, args->fl & ~LINUX_MS_SYNC)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } #endif struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. */ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER(2,4,0) ? 
\ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } #if defined(__amd64__) /* * On amd64, Linux uname(2) needs to return "x86_64" * for both 64-bit and 32-bit applications. On 32-bit, * the string returned by getauxval(AT_PLATFORM) needs * to remain "i686", though. */ #if defined(COMPAT_LINUX32) if (linux32_emulate_i386) strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME); else #endif strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME); #elif defined(__aarch64__) strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME); #elif defined(__i386__) strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME); #endif return (copyout(&utsname, args->buf, sizeof(utsname))); } struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; #ifdef LINUX_LEGACY_SYSCALLS int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; - char *fname; int error; if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut)) != 0) return (error); tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; - if (!LUSECONVPATH(td)) { - error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, - tvp, UIO_SYSSPACE); - } else { - LCONVPATHEXIST(args->fname, &fname); - error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, - UIO_SYSSPACE); - LFREEPATH(fname); - } - return (error); + return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, + tvp, UIO_SYSSPACE)); } #endif #ifdef LINUX_LEGACY_SYSCALLS int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; - char *fname; int error; if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv)) != 0) return (error); tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } - if (!LUSECONVPATH(td)) { - error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, - tvp, UIO_SYSSPACE); - } else { - LCONVPATHEXIST(args->fname, &fname); - error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, - tvp, UIO_SYSSPACE); - LFREEPATH(fname); - } - return (error); + return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, + tvp, UIO_SYSSPACE)); } #endif 
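/*
 * An illustrative userland sketch of the tick conversion performed by the
 * CONVOTCK()/CONVNTCK() macros above: each whole second contributes one
 * tick per 1/rate second, and the microsecond remainder is divided by the
 * length of a tick in microseconds.  The helper name and the 'rate'
 * parameter are hypothetical stand-ins for CLK_TCK (100, used for
 * pre-2.4.0 guests) or the statistics clock frequency stclohz; this is a
 * sketch only, not part of the change.
 */
#include <sys/time.h>

static long
timeval_to_ticks(const struct timeval *tv, long rate)
{

	return (tv->tv_sec * rate + tv->tv_usec / (1000000 / rate));
}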
static int linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times) { if (l_times->tv_nsec != LINUX_UTIME_OMIT && l_times->tv_nsec != LINUX_UTIME_NOW && (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999)) return (EINVAL); times->tv_sec = l_times->tv_sec; switch (l_times->tv_nsec) { case LINUX_UTIME_OMIT: times->tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times->tv_nsec = UTIME_NOW; break; default: times->tv_nsec = l_times->tv_nsec; } return (0); } static int linux_common_utimensat(struct thread *td, int ldfd, const char *pathname, struct timespec *timesp, int lflags) { - char *path = NULL; - int error, dfd, flags = 0; + int dfd, flags = 0; dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd; if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH)) return (EINVAL); if (timesp != NULL) { /* This breaks POSIX, but is what the Linux kernel does * _on purpose_ (documented in the man page for utimensat(2)), * so we must follow that behaviour. */ if (timesp[0].tv_nsec == UTIME_OMIT && timesp[1].tv_nsec == UTIME_OMIT) return (0); } if (lflags & LINUX_AT_SYMLINK_NOFOLLOW) flags |= AT_SYMLINK_NOFOLLOW; if (lflags & LINUX_AT_EMPTY_PATH) flags |= AT_EMPTY_PATH; - if (!LUSECONVPATH(td)) { - if (pathname != NULL) { - return (kern_utimensat(td, dfd, pathname, - UIO_USERSPACE, timesp, UIO_SYSSPACE, flags)); - } - } - if (pathname != NULL) - LCONVPATHEXIST_AT(pathname, &path, dfd); - else if (lflags != 0) - return (EINVAL); + return (kern_utimensat(td, dfd, pathname, + UIO_USERSPACE, timesp, UIO_SYSSPACE, flags)); - if (path == NULL) - error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE); - else { - error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp, - UIO_SYSSPACE, flags); - LFREEPATH(path); - } + if (lflags != 0) + return (EINVAL); - return (error); + return (kern_futimens(td, dfd, timesp, UIO_SYSSPACE)); } int linux_utimensat(struct thread *td, struct linux_utimensat_args *args) { struct l_timespec l_times[2]; struct timespec times[2], *timesp; int error; if (args->times != NULL) { error = copyin(args->times, l_times, sizeof(l_times)); if (error != 0) return (error); error = linux_utimensat_lts_to_ts(&l_times[0], ×[0]); if (error != 0) return (error); error = linux_utimensat_lts_to_ts(&l_times[1], ×[1]); if (error != 0) return (error); timesp = times; } else timesp = NULL; return (linux_common_utimensat(td, args->dfd, args->pathname, timesp, args->flags)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times) { /* Zero out the padding in compat mode. 
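* On the 32-bit ABIs the upper 32 bits of the 64-bit tv_nsec field are padding, so they are masked off before the range check and the LINUX_UTIME_* comparisons below.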
*/ l_times->tv_nsec &= 0xFFFFFFFFUL; if (l_times->tv_nsec != LINUX_UTIME_OMIT && l_times->tv_nsec != LINUX_UTIME_NOW && (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999)) return (EINVAL); times->tv_sec = l_times->tv_sec; switch (l_times->tv_nsec) { case LINUX_UTIME_OMIT: times->tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times->tv_nsec = UTIME_NOW; break; default: times->tv_nsec = l_times->tv_nsec; } return (0); } int linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args) { struct l_timespec64 l_times[2]; struct timespec times[2], *timesp; int error; if (args->times64 != NULL) { error = copyin(args->times64, l_times, sizeof(l_times)); if (error != 0) return (error); error = linux_utimensat_lts64_to_ts(&l_times[0], ×[0]); if (error != 0) return (error); error = linux_utimensat_lts64_to_ts(&l_times[1], ×[1]); if (error != 0) return (error); timesp = times; } else timesp = NULL; return (linux_common_utimensat(td, args->dfd, args->pathname, timesp, args->flags)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #ifdef LINUX_LEGACY_SYSCALLS int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; - char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv)) != 0) return (error); tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } - if (!LUSECONVPATH(td)) { - error = kern_utimesat(td, dfd, args->filename, UIO_USERSPACE, - tvp, UIO_SYSSPACE); - } else { - LCONVPATHEXIST_AT(args->filename, &fname, dfd); - error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, - tvp, UIO_SYSSPACE); - LFREEPATH(fname); - } - return (error); + return (kern_utimesat(td, dfd, args->filename, UIO_USERSPACE, + tvp, UIO_SYSSPACE)); } #endif static int linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp, int options, void *rup, l_siginfo_t *infop) { l_siginfo_t lsi; siginfo_t siginfo; struct __wrusage wru; int error, status, tmpstat, sig; error = kern_wait6(td, idtype, id, &status, options, rup != NULL ? 
&wru : NULL, &siginfo); if (error == 0 && statusp) { tmpstat = status & 0xffff; if (WIFSIGNALED(tmpstat)) { tmpstat = (tmpstat & 0xffffff80) | bsd_to_linux_signal(WTERMSIG(tmpstat)); } else if (WIFSTOPPED(tmpstat)) { tmpstat = (tmpstat & 0xffff00ff) | (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8); #if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32)) if (WSTOPSIG(status) == SIGTRAP) { tmpstat = linux_ptrace_status(td, siginfo.si_pid, tmpstat); } #endif } else if (WIFCONTINUED(tmpstat)) { tmpstat = 0xffff; } error = copyout(&tmpstat, statusp, sizeof(int)); } if (error == 0 && rup != NULL) error = linux_copyout_rusage(&wru.wru_self, rup); if (error == 0 && infop != NULL && td->td_retval[0] != 0) { sig = bsd_to_linux_signal(siginfo.si_signo); siginfo_to_lsiginfo(&siginfo, &lsi, sig); error = copyout(&lsi, infop, sizeof(lsi)); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { struct linux_wait4_args wait4_args = { .pid = args->pid, .status = args->status, .options = args->options, .rusage = NULL, }; return (linux_wait4(td, &wait4_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_wait4(struct thread *td, struct linux_wait4_args *args) { struct proc *p; int options, id, idtype; if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); /* -INT_MIN is not defined. */ if (args->pid == INT_MIN) return (ESRCH); options = 0; linux_to_bsd_waitopts(args->options, &options); /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; if (args->pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (args->pid < 0) { idtype = P_PGID; id = (id_t)-args->pid; } else if (args->pid == 0) { idtype = P_PGID; p = td->td_proc; PROC_LOCK(p); id = p->p_pgid; PROC_UNLOCK(p); } else { idtype = P_PID; id = (id_t)args->pid; } return (linux_common_wait(td, idtype, id, args->status, options, args->rusage, NULL)); } int linux_waitid(struct thread *td, struct linux_waitid_args *args) { idtype_t idtype; int error, options; struct proc *p; pid_t id; if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED | LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); options = 0; linux_to_bsd_waitopts(args->options, &options); id = args->id; switch (args->idtype) { case LINUX_P_ALL: idtype = P_ALL; break; case LINUX_P_PID: if (args->id <= 0) return (EINVAL); idtype = P_PID; break; case LINUX_P_PGID: if (linux_kernver(td) >= LINUX_KERNVER(5,4,0) && args->id == 0) { p = td->td_proc; PROC_LOCK(p); id = p->p_pgid; PROC_UNLOCK(p); } else if (args->id <= 0) return (EINVAL); idtype = P_PGID; break; case LINUX_P_PIDFD: LINUX_RATELIMIT_MSG("unsupported waitid P_PIDFD idtype"); return (ENOSYS); default: return (EINVAL); } error = linux_common_wait(td, idtype, id, NULL, options, args->rusage, args->info); td->td_retval[0] = 0; return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_mknod(struct thread *td, struct linux_mknod_args *args) { - char *path; int error; - enum uio_seg seg; - bool convpath; - - convpath = LUSECONVPATH(td); - if (!convpath) { - path = args->path; - seg = UIO_USERSPACE; - } else { - LCONVPATHCREAT(args->path, &path); - seg = UIO_SYSSPACE; - } switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: - error = kern_mkfifoat(td, AT_FDCWD, path, seg, + error = kern_mkfifoat(td, AT_FDCWD, 
args->path, UIO_USERSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: - error = kern_mknodat(td, AT_FDCWD, path, seg, + error = kern_mknodat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->mode, linux_decode_dev(args->dev)); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: - error = kern_openat(td, AT_FDCWD, path, seg, + error = kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } - if (convpath) - LFREEPATH(path); return (error); } #endif int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { - char *path; int error, dfd; - enum uio_seg seg; - bool convpath; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; - convpath = LUSECONVPATH(td); - if (!convpath) { - path = __DECONST(char *, args->filename); - seg = UIO_USERSPACE; - } else { - LCONVPATHCREAT_AT(args->filename, &path, dfd); - seg = UIO_SYSSPACE; - } - switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: - error = kern_mkfifoat(td, dfd, path, seg, args->mode); + error = kern_mkfifoat(td, dfd, args->filename, UIO_USERSPACE, + args->mode); break; case S_IFCHR: case S_IFBLK: - error = kern_mknodat(td, dfd, path, seg, args->mode, - linux_decode_dev(args->dev)); + error = kern_mknodat(td, dfd, args->filename, UIO_USERSPACE, + args->mode, linux_decode_dev(args->dev)); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: - error = kern_openat(td, dfd, path, seg, + error = kern_openat(td, dfd, args->filename, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } - if (convpath) - LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! 
*/ int linux_personality(struct thread *td, struct linux_personality_args *args) { struct linux_pemuldata *pem; struct proc *p = td->td_proc; uint32_t old; PROC_LOCK(p); pem = pem_find(p); old = pem->persona; if (args->per != 0xffffffff) pem->persona = args->per; PROC_UNLOCK(p); td->td_retval[0] = old; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_nice(struct thread *td, struct linux_nice_args *args) { return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); crextend(newcred, ngrp + 1); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; crcopy(newcred, oldcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. 
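* A gidsetsize of zero is a pure query: only the number of supplementary groups is returned and nothing is copied out.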
*/ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_LINUX); if (error) return (error); td->td_retval[0] = ngrp; return (0); } static bool linux_get_dummy_limit(l_uint resource, struct rlimit *rlim) { if (linux_dummy_rlimits == 0) return (false); switch (resource) { case LINUX_RLIMIT_LOCKS: case LINUX_RLIMIT_SIGPENDING: case LINUX_RLIMIT_MSGQUEUE: case LINUX_RLIMIT_RTTIME: rlim->rlim_cur = LINUX_RLIM_INFINITY; rlim->rlim_max = LINUX_RLIM_INFINITY; return (true); case LINUX_RLIMIT_NICE: case LINUX_RLIMIT_RTPRIO: rlim->rlim_cur = 0; rlim->rlim_max = 0; return (true); default: return (false); } } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; if (linux_get_dummy_limit(args->resource, &bsd_rlim)) { rlim.rlim_cur = bsd_rlim.rlim_cur; rlim.rlim_max = bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; if (linux_get_dummy_limit(args->resource, &bsd_rlim)) { rlim.rlim_cur = bsd_rlim.rlim_cur; rlim.rlim_max = bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_param sched_param; struct thread *tdt; int error, policy; switch (args->policy) { case LINUX_SCHED_OTHER: policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: policy = SCHED_FIFO; break; case LINUX_SCHED_RR: policy = SCHED_RR; break; 
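/* Other Linux scheduling policies (e.g. SCHED_BATCH or SCHED_IDLE) have no direct FreeBSD counterpart and are rejected with EINVAL by the default case that follows. */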
default: return (EINVAL); } error = copyin(args->param, &sched_param, sizeof(sched_param)); if (error) return (error); if (linux_map_sched_prio) { switch (policy) { case SCHED_OTHER: if (sched_param.sched_priority != 0) return (EINVAL); sched_param.sched_priority = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE; break; case SCHED_FIFO: case SCHED_RR: if (sched_param.sched_priority < 1 || sched_param.sched_priority >= LINUX_MAX_RT_PRIO) return (EINVAL); /* * Map [1, LINUX_MAX_RT_PRIO - 1] to * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down). */ sched_param.sched_priority = (sched_param.sched_priority - 1) * (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) / (LINUX_MAX_RT_PRIO - 1); break; } } tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setscheduler(td, tdt, policy, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct thread *tdt; int error, policy; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; if (linux_map_sched_prio) { switch (args->policy) { case LINUX_SCHED_OTHER: td->td_retval[0] = 0; return (0); case LINUX_SCHED_FIFO: case LINUX_SCHED_RR: td->td_retval[0] = LINUX_MAX_RT_PRIO - 1; return (0); default: return (EINVAL); } } switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; if (linux_map_sched_prio) { switch (args->policy) { case LINUX_SCHED_OTHER: td->td_retval[0] = 0; return (0); case LINUX_SCHED_FIFO: case LINUX_SCHED_RR: td->td_retval[0] = 1; return (0); default: return (EINVAL); } } switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, 
&bsd_args)); } int linux_getpid(struct thread *td, struct linux_getpid_args *args) { td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("gettid: emuldata not found.\n")); td->td_retval[0] = em->em_tid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { td->td_retval[0] = kern_getppid(td); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { return (kern_getsid(td, args->pid)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { int error; error = kern_getpriority(td, args->which, args->who); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, args->error_code, 0); /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_VERSION_2 0x20071026 #define _LINUX_CAPABILITY_VERSION_3 0x20080522 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, u32s; if (uap->hdrp == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (uap->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. 
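* That is done by copying out u32s zero-filled l_user_cap_data slots: one for a v1 header, two for v2/v3.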
*/ memset(&lucd, 0, u32s * sizeof(lucd[0])); error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0])); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, i, u32s; if (uap->hdrp == NULL || uap->datap == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0])); if (error != 0) return (error); /* We currently don't support setting any capabilities. */ for (i = 0; i < u32s; i++) { if (lucd[i].effective || lucd[i].permitted || lucd[i].inheritable) { linux_msg(td, "capset[%d] effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", i, (int)lucd[i].effective, (int)lucd[i].permitted, (int)lucd[i].inheritable); return (EPERM); } } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size, arg; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; int pdeath_signal, trace_state; switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); pdeath_signal = linux_to_bsd_signal(args->arg2); return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL, &pdeath_signal)); case LINUX_PR_GET_PDEATHSIG: error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS, &pdeath_signal); if (error != 0) return (error); pdeath_signal = bsd_to_linux_signal(pdeath_signal); return (copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal))); /* * In Linux, this flag controls if set[gu]id processes can coredump. * There are additional semantics imposed on processes that cannot * coredump: * - Such processes can not be ptraced. * - There are some semantics around ownership of process-related files * in the /proc namespace. * * In FreeBSD, we can (and by default, do) disable setuid coredump * system-wide with 'sugid_coredump.' We control tracability on a * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag). * By happy coincidence, P2_NOTRACE also prevents coredumping. So the * procctl is roughly analogous to Linux's DUMPABLE. * * So, proxy these knobs to the corresponding PROC_TRACE setting. */ case LINUX_PR_GET_DUMPABLE: error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS, &trace_state); if (error != 0) return (error); td->td_retval[0] = (trace_state != -1); return (0); case LINUX_PR_SET_DUMPABLE: /* * It is only valid for userspace to set one of these two * flags, and only one at a time. */ switch (args->arg2) { case LINUX_SUID_DUMP_DISABLE: trace_state = PROC_TRACE_CTL_DISABLE_EXEC; break; case LINUX_SUID_DUMP_USER: trace_state = PROC_TRACE_CTL_ENABLE; break; default: return (EINVAL); } return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL, &trace_state)); case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). 
*/ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a Linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; case LINUX_PR_GET_SECCOMP: case LINUX_PR_SET_SECCOMP: /* * Same as returned by Linux without CONFIG_SECCOMP enabled. */ error = EINVAL; break; case LINUX_PR_CAPBSET_READ: #if 0 /* * This makes too much noise with Ubuntu Focal. */ linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d", (int)args->arg2); #endif error = EINVAL; break; case LINUX_PR_SET_NO_NEW_PRIVS: arg = args->arg2 == 1 ? PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE; error = kern_procctl(td, P_PID, p->p_pid, PROC_NO_NEW_PRIVS_CTL, &arg); break; case LINUX_PR_SET_PTRACER: linux_msg(td, "unsupported prctl PR_SET_PTRACER"); error = EINVAL; break; default: linux_msg(td, "unsupported prctl option %d", args->option); error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error, policy; error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); if (linux_map_sched_prio) { error = kern_sched_getscheduler(td, tdt, &policy); if (error) goto out; switch (policy) { case SCHED_OTHER: if (sched_param.sched_priority != 0) { error = EINVAL; goto out; } sched_param.sched_priority = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE; break; case SCHED_FIFO: case SCHED_RR: if (sched_param.sched_priority < 1 || sched_param.sched_priority >= LINUX_MAX_RT_PRIO) { error = EINVAL; goto out; } /* * Map [1, LINUX_MAX_RT_PRIO - 1] to * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down). 
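* For example, assuming LINUX_MAX_RT_PRIO is 100 and the rtprio range is 0..31, Linux priority 1 maps to 0, 50 maps to 15 and 99 maps to 31.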
*/ sched_param.sched_priority = (sched_param.sched_priority - 1) * (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) / (LINUX_MAX_RT_PRIO - 1); break; } } error = kern_sched_setparam(td, tdt, &sched_param); out: PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error, policy; tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); if (error) { PROC_UNLOCK(tdt->td_proc); return (error); } if (linux_map_sched_prio) { error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); if (error) return (error); switch (policy) { case SCHED_OTHER: sched_param.sched_priority = 0; break; case SCHED_FIFO: case SCHED_RR: /* * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to * [1, LINUX_MAX_RT_PRIO - 1] (rounding up). */ sched_param.sched_priority = (sched_param.sched_priority * (LINUX_MAX_RT_PRIO - 1) + (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) / (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1; break; } } else PROC_UNLOCK(tdt->td_proc); error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { struct thread *tdt; cpuset_t *mask; size_t size; int error; id_t tid; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); tid = tdt->td_tid; PROC_UNLOCK(tdt->td_proc); mask = malloc(sizeof(cpuset_t), M_LINUX, M_WAITOK | M_ZERO); size = min(args->len, sizeof(cpuset_t)); error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, size, mask); if (error == ERANGE) error = EINVAL; if (error == 0) error = copyout(mask, args->user_mask_ptr, size); if (error == 0) td->td_retval[0] = size; free(mask, M_LINUX); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct thread *tdt; cpuset_t *mask; int cpu, error; size_t len; id_t tid; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); tid = tdt->td_tid; PROC_UNLOCK(tdt->td_proc); len = min(args->len, sizeof(cpuset_t)); mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO);; error = copyin(args->user_mask_ptr, mask, len); if (error != 0) goto out; /* Linux ignore high bits */ CPU_FOREACH_ISSET(cpu, mask) if (cpu > mp_maxid) CPU_CLR(cpu, mask); error = kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tid, mask); if (error == EDEADLK) error = EINVAL; out: free(mask, M_TEMP); return (error); } struct linux_rlimit64 { uint64_t rlim_cur; uint64_t rlim_max; }; int linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args) { struct rlimit rlim, nrlim; struct linux_rlimit64 lrlim; struct proc *p; u_int which; int flags; int error; if (args->new == NULL && args->old != NULL) { if (linux_get_dummy_limit(args->resource, &rlim)) { lrlim.rlim_cur = rlim.rlim_cur; lrlim.rlim_max = rlim.rlim_max; return (copyout(&lrlim, args->old, sizeof(lrlim))); } } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); if (args->new != NULL) { /* * Note. Unlike FreeBSD where rlim is signed 64-bit Linux * rlim is unsigned 64-bit. FreeBSD treats negative limits * as INFINITY so we do not need a conversion even. 
*/ error = copyin(args->new, &nrlim, sizeof(nrlim)); if (error != 0) return (error); } flags = PGET_HOLD | PGET_NOTWEXIT; if (args->new != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; if (args->pid == 0) { p = td->td_proc; PHOLD(p); } else { error = pget(args->pid, flags, &p); if (error != 0) return (error); } if (args->old != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur == RLIM_INFINITY) lrlim.rlim_cur = LINUX_RLIM_INFINITY; else lrlim.rlim_cur = rlim.rlim_cur; if (rlim.rlim_max == RLIM_INFINITY) lrlim.rlim_max = LINUX_RLIM_INFINITY; else lrlim.rlim_max = rlim.rlim_max; error = copyout(&lrlim, args->old, sizeof(lrlim)); if (error != 0) goto out; } if (args->new != NULL) error = kern_proc_setrlimit(td, p, which, &nrlim); out: PRELE(p); return (error); } int linux_pselect6(struct thread *td, struct linux_pselect6_args *args) { struct timespec ts, *tsp; int error; if (args->tsp != NULL) { error = linux_get_timespec(&ts, args->tsp); if (error != 0) return (error); tsp = &ts; } else tsp = NULL; error = linux_common_pselect6(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tsp, args->sig); if (args->tsp != NULL) linux_put_timespec(&ts, args->tsp); return (error); } static int linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds, l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp, l_uintptr_t *sig) { struct timeval utv, tv0, tv1, *tvp; struct l_pselect6arg lpse6; sigset_t *ssp; sigset_t ss; int error; ssp = NULL; if (sig != NULL) { error = copyin(sig, &lpse6, sizeof(lpse6)); if (error != 0) return (error); error = linux_copyin_sigset(td, PTRIN(lpse6.ss), lpse6.ss_len, &ss, &ssp); if (error != 0) return (error); } else ssp = NULL; /* * Currently glibc changes nanosecond number to microsecond. * This mean losing precision but for now it is hardly seen. */ if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&utv, tsp); if (itimerfix(&utv)) return (EINVAL); microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_pselect(td, nfds, readfds, writefds, exceptfds, tvp, ssp, LINUX_NFDBITS); if (tsp != NULL) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. 
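* The remainder is clamped at zero and written back through *tsp so that callers such as linux_pselect6() can copy the updated timeout out to userspace.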
*/ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); TIMEVAL_TO_TIMESPEC(&utv, tsp); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_pselect6_time64(struct thread *td, struct linux_pselect6_time64_args *args) { struct timespec ts, *tsp; int error; if (args->tsp != NULL) { error = linux_get_timespec64(&ts, args->tsp); if (error != 0) return (error); tsp = &ts; } else tsp = NULL; error = linux_common_pselect6(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tsp, args->sig); if (args->tsp != NULL) linux_put_timespec64(&ts, args->tsp); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_ppoll(struct thread *td, struct linux_ppoll_args *args) { struct timespec uts, *tsp; int error; if (args->tsp != NULL) { error = linux_get_timespec(&uts, args->tsp); if (error != 0) return (error); tsp = &uts; } else tsp = NULL; error = linux_common_ppoll(td, args->fds, args->nfds, tsp, args->sset, args->ssize); if (error == 0 && args->tsp != NULL) error = linux_put_timespec(&uts, args->tsp); return (error); } static int linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds, struct timespec *tsp, l_sigset_t *sset, l_size_t ssize) { struct timespec ts0, ts1; struct pollfd stackfds[32]; struct pollfd *kfds; sigset_t *ssp; sigset_t ss; int error; if (kern_poll_maxfds(nfds)) return (EINVAL); if (sset != NULL) { error = linux_copyin_sigset(td, sset, ssize, &ss, &ssp); if (error != 0) return (error); } else ssp = NULL; if (tsp != NULL) nanotime(&ts0); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else kfds = stackfds; error = linux_pollin(td, kfds, fds, nfds); if (error != 0) goto out; error = kern_poll_kfds(td, kfds, nfds, tsp, ssp); if (error == 0) error = linux_pollout(td, kfds, fds, nfds); if (error == 0 && tsp != NULL) { if (td->td_retval[0]) { nanotime(&ts1); timespecsub(&ts1, &ts0, &ts1); timespecsub(tsp, &ts1, tsp); if (tsp->tv_sec < 0) timespecclear(tsp); } else timespecclear(tsp); } out: if (nfds > nitems(stackfds)) free(kfds, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args) { struct timespec uts, *tsp; int error; if (args->tsp != NULL) { error = linux_get_timespec64(&uts, args->tsp); if (error != 0) return (error); tsp = &uts; } else tsp = NULL; error = linux_common_ppoll(td, args->fds, args->nfds, tsp, args->sset, args->ssize); if (error == 0 && args->tsp != NULL) error = linux_put_timespec64(&uts, args->tsp); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ static int linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error; u_int i; error = copyin(ufds, fds, nfd * sizeof(*fds)); if (error != 0) return (error); for (i = 0; i < nfd; i++) { if (fds->events != 0) linux_to_bsd_poll_events(td, fds->fd, fds->events, &fds->events); fds++; } return (0); } static int linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error = 0; u_int i, n = 0; for (i = 0; i < nfd; i++) { if (fds->revents != 0) { bsd_to_linux_poll_events(fds->revents, &fds->revents); n++; } error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); fds++; ufds++; } td->td_retval[0] = n; return (0); } static int linux_sched_rr_get_interval_common(struct thread *td, 
pid_t pid, struct timespec *ts) { struct thread *tdt; int error; /* * According to man in case the invalid pid specified * EINVAL should be returned. */ if (pid < 0) return (EINVAL); tdt = linux_tdfind(td, pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, ts); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; int error; error = linux_sched_rr_get_interval_common(td, uap->pid, &ts); if (error != 0) return (error); return (linux_put_timespec(&ts, uap->interval)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_sched_rr_get_interval_time64(struct thread *td, struct linux_sched_rr_get_interval_time64_args *uap) { struct timespec ts; int error; error = linux_sched_rr_get_interval_common(td, uap->pid, &ts); if (error != 0) return (error); return (linux_put_timespec64(&ts, uap->interval)); } #endif /* * In case when the Linux thread is the initial thread in * the thread group thread id is equal to the process id. * Glibc depends on this magic (assert in pthread_getattr_np.c). */ struct thread * linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) { struct linux_emuldata *em; struct thread *tdt; struct proc *p; tdt = NULL; if (tid == 0 || tid == td->td_tid) { if (pid != -1 && td->td_proc->p_pid != pid) return (NULL); PROC_LOCK(td->td_proc); return (td); } else if (tid > PID_MAX) return (tdfind(tid, pid)); /* * Initial thread where the tid equal to the pid. */ p = pfind(tid); if (p != NULL) { if (SV_PROC_ABI(p) != SV_ABI_LINUX || (pid != -1 && tid != pid)) { /* * p is not a Linuxulator process. */ PROC_UNLOCK(p); return (NULL); } FOREACH_THREAD_IN_PROC(p, tdt) { em = em_find(tdt); if (tid == em->em_tid) return (tdt); } PROC_UNLOCK(p); } return (NULL); } void linux_to_bsd_waitopts(int options, int *bsdopts) { if (options & LINUX_WNOHANG) *bsdopts |= WNOHANG; if (options & LINUX_WUNTRACED) *bsdopts |= WUNTRACED; if (options & LINUX_WEXITED) *bsdopts |= WEXITED; if (options & LINUX_WCONTINUED) *bsdopts |= WCONTINUED; if (options & LINUX_WNOWAIT) *bsdopts |= WNOWAIT; if (options & __WCLONE) *bsdopts |= WLINUXCLONE; } int linux_getrandom(struct thread *td, struct linux_getrandom_args *args) { struct uio uio; struct iovec iov; int error; if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM)) return (EINVAL); if (args->count > INT_MAX) args->count = INT_MAX; iov.iov_base = args->buf; iov.iov_len = args->count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = iov.iov_len; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK); if (error == 0) td->td_retval[0] = args->count - uio.uio_resid; return (error); } int linux_mincore(struct thread *td, struct linux_mincore_args *args) { /* Needs to be page-aligned */ if (args->start & PAGE_MASK) return (EINVAL); return (kern_mincore(td, args->start, args->len, args->vec)); } #define SYSLOG_TAG "<6>" int linux_syslog(struct thread *td, struct linux_syslog_args *args) { char buf[128], *src, *dst; u_int seq; int buflen, error; if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) { linux_msg(td, "syslog unsupported type 0x%x", args->type); return (EINVAL); } if (args->len < 6) { td->td_retval[0] = 0; return (0); } error = priv_check(td, PRIV_MSGBUF); if (error) return (error); mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); mtx_unlock(&msgbuf_lock); dst = args->buf; error = 
copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG)); /* The -1 is to skip the trailing '\0'. */ dst += sizeof(SYSLOG_TAG) - 1; while (error == 0) { mtx_lock(&msgbuf_lock); buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (buflen == 0) break; for (src = buf; src < buf + buflen && error == 0; src++) { if (*src == '\0') continue; if (dst >= args->buf + args->len) goto out; error = copyout(src, dst, 1); dst++; if (*src == '\n' && *(src + 1) != '<' && dst + sizeof(SYSLOG_TAG) < args->buf + args->len) { error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG)); dst += sizeof(SYSLOG_TAG) - 1; } } } out: td->td_retval[0] = dst - args->buf; return (error); } int linux_getcpu(struct thread *td, struct linux_getcpu_args *args) { int cpu, error, node; cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */ error = 0; node = cpuid_to_pcpu[cpu]->pc_domain; if (args->cpu != NULL) error = copyout(&cpu, args->cpu, sizeof(l_int)); if (args->node != NULL) error = copyout(&node, args->node, sizeof(l_int)); return (error); } #if defined(__i386__) || defined(__amd64__) int linux_poll(struct thread *td, struct linux_poll_args *args) { struct timespec ts, *tsp; if (args->timeout != INFTIM) { if (args->timeout < 0) return (EINVAL); ts.tv_sec = args->timeout / 1000; ts.tv_nsec = (args->timeout % 1000) * 1000000; tsp = &ts; } else tsp = NULL; return (linux_common_ppoll(td, args->fds, args->nfds, tsp, NULL, 0)); } #endif /* __i386__ || __amd64__ */ int linux_seccomp(struct thread *td, struct linux_seccomp_args *args) { switch (args->op) { case LINUX_SECCOMP_GET_ACTION_AVAIL: return (EOPNOTSUPP); default: /* * Ignore unknown operations, just like Linux kernel built * without CONFIG_SECCOMP. */ return (EINVAL); } } #ifndef COMPAT_LINUX32 int linux_execve(struct thread *td, struct linux_execve_args *args) { struct image_args eargs; - char *path; int error; LINUX_CTR(execve); - if (!LUSECONVPATH(td)) { - error = exec_copyin_args(&eargs, args->path, UIO_USERSPACE, - args->argp, args->envp); - } else { - LCONVPATHEXIST(args->path, &path); - error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, - args->envp); - LFREEPATH(path); - } + error = exec_copyin_args(&eargs, args->path, UIO_USERSPACE, + args->argp, args->envp); if (error == 0) error = linux_common_execve(td, &eargs); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #endif diff --git a/sys/compat/linux/linux_stats.c b/sys/compat/linux/linux_stats.c index 0de3438fda24..bdf3e320efcf 100644 --- a/sys/compat/linux/linux_stats.c +++ b/sys/compat/linux/linux_stats.c @@ -1,795 +1,716 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include static int linux_kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct vnode *vp; struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred); if (error == 0 && (vp = fp->f_vnode) != NULL) translate_vnhook_major_minor(vp, sbp); fdrop(fp, td); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } static int linux_kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) { if (error == ENOTDIR && (nd.ni_resflags & NIRES_EMPTYPATH) != 0) error = linux_kern_fstat(td, fd, sbp); return (error); } error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED); if (error == 0) translate_vnhook_major_minor(nd.ni_vp, sbp); NDFREE_PNBUF(&nd); vput(nd.ni_vp); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #ifdef LINUX_LEGACY_SYSCALLS static int linux_kern_stat(struct thread *td, const char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp)); } static int linux_kern_lstat(struct thread *td, const char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg, sbp)); } #endif static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; bzero(&tbuf, sizeof(tbuf)); tbuf.st_dev = linux_new_encode_dev(buf->st_dev); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = linux_new_encode_dev(buf->st_rdev); tbuf.st_size = buf->st_size; tbuf.st_atim.tv_sec = buf->st_atim.tv_sec; tbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; tbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; tbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; tbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; tbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; return (copyout(&tbuf, ubuf, sizeof(tbuf))); } #ifdef LINUX_LEGACY_SYSCALLS int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; - char *path; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_stat(td, args->path, UIO_USERSPACE, 
&buf); - } else { - LCONVPATHEXIST(args->path, &path); - error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_stat(td, args->path, UIO_USERSPACE, &buf); if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { struct stat sb; - char *path; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &sb); - } else { - LCONVPATHEXIST(args->path, &path); - error = linux_kern_lstat(td, path, UIO_SYSSPACE, &sb); - LFREEPATH(path); - } + error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &sb); if (error) return (error); return (newstat_copyout(&sb, args->buf)); } #endif int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { struct stat buf; int error; error = linux_kern_fstat(td, args->fd, &buf); if (!error) error = newstat_copyout(&buf, args->buf); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static __inline uint16_t linux_old_encode_dev(dev_t _dev) { return (_dev == NODEV ? 0 : linux_encode_dev(major(_dev), minor(_dev))); } static int old_stat_copyout(struct stat *buf, void *ubuf) { struct l_old_stat lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = linux_old_encode_dev(buf->st_dev); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = linux_old_encode_dev(buf->st_rdev); lbuf.st_size = MIN(buf->st_size, INT32_MAX); lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; lbuf.st_flags = buf->st_flags; lbuf.st_gen = buf->st_gen; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat(struct thread *td, struct linux_stat_args *args) { struct stat buf; - char *path; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_stat(td, args->path, UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST(args->path, &path); - error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_stat(td, args->path, UIO_USERSPACE, &buf); if (error) { return (error); } return (old_stat_copyout(&buf, args->up)); } int linux_lstat(struct thread *td, struct linux_lstat_args *args) { struct stat buf; - char *path; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST(args->path, &path); - error = linux_kern_lstat(td, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &buf); if (error) { return (error); } return (old_stat_copyout(&buf, args->up)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ struct l_statfs { l_long f_type; l_long f_bsize; l_long f_blocks; l_long f_bfree; l_long f_bavail; l_long f_files; l_long f_ffree; l_fsid_t f_fsid; l_long f_namelen; l_long f_frsize; l_long f_flags; l_long f_spare[4]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL 
#define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ #define LINUX_ZFS_SUPER_MAGIC 0x2FC12FC1 #define LINUX_DEVFS_SUPER_MAGIC 0x1373L #define LINUX_SHMFS_MAGIC 0x01021994 static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"zfs", LINUX_ZFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {"devfs", LINUX_DEVFS_SUPER_MAGIC}, {"tmpfs", LINUX_SHMFS_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } static int bsd_to_linux_mnt_flags(int f_flags) { int flags = LINUX_ST_VALID; if (f_flags & MNT_RDONLY) flags |= LINUX_ST_RDONLY; if (f_flags & MNT_NOEXEC) flags |= LINUX_ST_NOEXEC; if (f_flags & MNT_NOSUID) flags |= LINUX_ST_NOSUID; if (f_flags & MNT_NOATIME) flags |= LINUX_ST_NOATIME; if (f_flags & MNT_NOSYMFOLLOW) flags |= LINUX_ST_NOSYMFOLLOW; if (f_flags & MNT_SYNCHRONOUS) flags |= LINUX_ST_SYNCHRONOUS; return (flags); } static int bsd_to_linux_statfs(struct statfs *bsd_statfs, struct l_statfs *linux_statfs) { #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) statfs_scale_blocks(bsd_statfs, INT32_MAX); #endif linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) linux_statfs->f_ffree = MIN(bsd_statfs->f_ffree, INT32_MAX); linux_statfs->f_files = MIN(bsd_statfs->f_files, INT32_MAX); #else linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; #endif linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = bsd_to_linux_mnt_flags(bsd_statfs->f_flags); memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); return (0); } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; - char *path; int error; - if (!LUSECONVPATH(td)) { - bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); - error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); - } else { - LCONVPATHEXIST(args->path, &path); - bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); - error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); - LFREEPATH(path); - } + bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static void bsd_to_linux_statfs64(struct statfs *bsd_statfs, struct l_statfs64 *linux_statfs) { linux_statfs->f_type = 
bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = bsd_to_linux_mnt_flags(bsd_statfs->f_flags); memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); } int linux_statfs64(struct thread *td, struct linux_statfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; - char *path; int error; if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); - if (!LUSECONVPATH(td)) { - bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); - error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); - } else { - LCONVPATHEXIST(args->path, &path); - bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); - error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); - LFREEPATH(path); - } + bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } int linux_fstatfs64(struct thread *td, struct linux_fstatfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; int error; if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; int error; bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; #ifdef LINUX_LEGACY_SYSCALLS int linux_ustat(struct thread *td, struct linux_ustat_args *args) { return (EOPNOTSUPP); } #endif #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = linux_new_encode_dev(buf->st_dev); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = linux_new_encode_dev(buf->st_rdev); lbuf.st_size = buf->st_size; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; 
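Editor's note on the statfs family above: the Linux-style f_type magic is produced by name-matching the FreeBSD f_fstypename in bsd_to_linux_ftype(), falling back to 0 for anything unknown. Below is a minimal userland sketch of that lookup; the table subset and magic values are copied from the code above, while main() and the printed strings are illustrative scaffolding only.

#include <stdio.h>
#include <string.h>

struct fs_map { const char *bsd_name; long linux_magic; };

/* Subset of the table used by bsd_to_linux_ftype() above. */
static const struct fs_map map[] = {
	{ "ufs",    0x00011954L },	/* LINUX_UFS_SUPER_MAGIC */
	{ "zfs",    0x2FC12FC1L },	/* LINUX_ZFS_SUPER_MAGIC */
	{ "ext2fs", 0xEF53L },		/* LINUX_EXT2_SUPER_MAGIC */
	{ "tmpfs",  0x01021994L },	/* LINUX_SHMFS_MAGIC */
	{ NULL,     0L }
};

static long
ftype_to_magic(const char *fstypename)
{
	for (int i = 0; map[i].bsd_name != NULL; i++)
		if (strcmp(map[i].bsd_name, fstypename) == 0)
			return (map[i].linux_magic);
	return (0L);	/* unknown filesystems report a zero f_type */
}

int
main(void)
{
	printf("zfs    -> 0x%lx\n", ftype_to_magic("zfs"));
	printf("nullfs -> 0x%lx\n", ftype_to_magic("nullfs"));
	return (0);
}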
lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; - char *filename; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_stat(td, args->filename, UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST(args->filename, &filename); - error = linux_kern_stat(td, filename, UIO_SYSSPACE, &buf); - LFREEPATH(filename); - } + error = linux_kern_stat(td, args->filename, UIO_USERSPACE, &buf); if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { struct stat sb; - char *filename; int error; - if (!LUSECONVPATH(td)) { - error = linux_kern_lstat(td, args->filename, UIO_USERSPACE, &sb); - } else { - LCONVPATHEXIST(args->filename, &filename); - error = linux_kern_lstat(td, filename, UIO_SYSSPACE, &sb); - LFREEPATH(filename); - } + error = linux_kern_lstat(td, args->filename, UIO_USERSPACE, &sb); if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct stat buf; int error; error = linux_kern_fstat(td, args->fd, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } int linux_fstatat64(struct thread *td, struct linux_fstatat64_args *args) { - char *path; int error, dfd, flag, unsupported; struct stat buf; unsupported = args->flag & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH); if (unsupported != 0) { linux_msg(td, "fstatat64 unsupported flag 0x%x", unsupported); return (EINVAL); } flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; flag |= (args->flag & LINUX_AT_EMPTY_PATH) ? AT_EMPTY_PATH : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - error = linux_kern_statat(td, flag, dfd, args->pathname, - UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST_AT(args->pathname, &path, dfd); - error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_statat(td, flag, dfd, args->pathname, + UIO_USERSPACE, &buf); if (error == 0) error = stat64_copyout(&buf, args->statbuf); return (error); } #else /* __amd64__ && !COMPAT_LINUX32 */ int linux_newfstatat(struct thread *td, struct linux_newfstatat_args *args) { - char *path; int error, dfd, flag, unsupported; struct stat buf; unsupported = args->flag & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH); if (unsupported != 0) { linux_msg(td, "fstatat unsupported flag 0x%x", unsupported); return (EINVAL); } flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; flag |= (args->flag & LINUX_AT_EMPTY_PATH) ? AT_EMPTY_PATH : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dfd; - if (!LUSECONVPATH(td)) { - error = linux_kern_statat(td, flag, dfd, args->pathname, - UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST_AT(args->pathname, &path, dfd); - error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_statat(td, flag, dfd, args->pathname, + UIO_USERSPACE, &buf); if (error == 0) error = newstat_copyout(&buf, args->statbuf); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_syncfs(struct thread *td, struct linux_syncfs_args *args) { struct mount *mp; struct vnode *vp; int error, save; error = fgetvp(td, args->fd, &cap_fsync_rights, &vp); if (error != 0) /* * Linux syncfs() returns only EBADF, however fgetvp() * can return EINVAL in case of file descriptor does * not represent a vnode. XXX. */ return (error); mp = vp->v_mount; mtx_lock(&mountlist_mtx); error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error != 0) { /* See comment above. */ mtx_unlock(&mountlist_mtx); goto out; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } vfs_unbusy(mp); out: vrele(vp); return (error); } static int statx_copyout(struct stat *buf, void *ubuf) { struct l_statx tbuf; bzero(&tbuf, sizeof(tbuf)); tbuf.stx_mask = STATX_ALL; tbuf.stx_blksize = buf->st_blksize; tbuf.stx_attributes = 0; tbuf.stx_nlink = buf->st_nlink; tbuf.stx_uid = buf->st_uid; tbuf.stx_gid = buf->st_gid; tbuf.stx_mode = buf->st_mode; tbuf.stx_ino = buf->st_ino; tbuf.stx_size = buf->st_size; tbuf.stx_blocks = buf->st_blocks; tbuf.stx_atime.tv_sec = buf->st_atim.tv_sec; tbuf.stx_atime.tv_nsec = buf->st_atim.tv_nsec; tbuf.stx_btime.tv_sec = buf->st_birthtim.tv_sec; tbuf.stx_btime.tv_nsec = buf->st_birthtim.tv_nsec; tbuf.stx_ctime.tv_sec = buf->st_ctim.tv_sec; tbuf.stx_ctime.tv_nsec = buf->st_ctim.tv_nsec; tbuf.stx_mtime.tv_sec = buf->st_mtim.tv_sec; tbuf.stx_mtime.tv_nsec = buf->st_mtim.tv_nsec; tbuf.stx_rdev_major = linux_encode_major(buf->st_rdev); tbuf.stx_rdev_minor = linux_encode_minor(buf->st_rdev); tbuf.stx_dev_major = linux_encode_major(buf->st_dev); tbuf.stx_dev_minor = linux_encode_minor(buf->st_dev); return (copyout(&tbuf, ubuf, sizeof(tbuf))); } int linux_statx(struct thread *td, struct linux_statx_args *args) { - char *path; int error, dirfd, flags, unsupported; struct stat buf; unsupported = args->flags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH | LINUX_AT_NO_AUTOMOUNT); if (unsupported != 0) { linux_msg(td, "statx unsupported flags 0x%x", unsupported); return (EINVAL); } flags = (args->flags & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; flags |= (args->flags & LINUX_AT_EMPTY_PATH) ? AT_EMPTY_PATH : 0; dirfd = (args->dirfd == LINUX_AT_FDCWD) ? 
AT_FDCWD : args->dirfd; - if (!LUSECONVPATH(td)) { - error = linux_kern_statat(td, flags, dirfd, args->pathname, - UIO_USERSPACE, &buf); - } else { - LCONVPATHEXIST_AT(args->pathname, &path, dirfd); - error = linux_kern_statat(td, flags, dirfd, path, UIO_SYSSPACE, &buf); - LFREEPATH(path); - } + error = linux_kern_statat(td, flags, dirfd, args->pathname, + UIO_USERSPACE, &buf); if (error == 0) error = statx_copyout(&buf, args->statxbuf); return (error); } diff --git a/sys/compat/linux/linux_uid16.c b/sys/compat/linux/linux_uid16.c index 181ecd2c7e12..7b7dc8e0f0bb 100644 --- a/sys/compat/linux/linux_uid16.c +++ b/sys/compat/linux/linux_uid16.c @@ -1,348 +1,313 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE1(uid16, linux_chown16, conv_path, "char *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_lchown16, conv_path, "char *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, priv_check_cred_error, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_getgroups16, copyout_error, "int"); DUMMY(setfsuid16); DUMMY(setfsgid16); DUMMY(getresuid16); DUMMY(getresgid16); #define CAST_NOCHG(x) ((x == 0xFFFF) ? -1 : x) int linux_chown16(struct thread *td, struct linux_chown16_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td) && !SDT_PROBES_ENABLED()) { - error = kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, - CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), 0); - } else { - LCONVPATHEXIST(args->path, &path); - /* - * The DTrace probes have to be after the LCONVPATHEXIST, as - * LCONVPATHEXIST may return on its own and we do not want to - * have a stray entry without the corresponding return. 
- */ - LIN_SDT_PROBE1(uid16, linux_chown16, conv_path, path); - - error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, - CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), 0); - LFREEPATH(path); - } - return (error); + return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, + CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), 0)); } int linux_lchown16(struct thread *td, struct linux_lchown16_args *args) { - char *path; - int error; - if (!LUSECONVPATH(td) && !SDT_PROBES_ENABLED()) { - error = kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, - CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), AT_SYMLINK_NOFOLLOW); - } else { - LCONVPATHEXIST(args->path, &path); - - /* - * The DTrace probes have to be after the LCONVPATHEXIST, as - * LCONVPATHEXIST may return on its own and we do not want to - * have a stray entry without the corresponding return. - */ - LIN_SDT_PROBE1(uid16, linux_lchown16, conv_path, path); - - error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, - CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), AT_SYMLINK_NOFOLLOW); - LFREEPATH(path); - } - return (error); + return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, + CAST_NOCHG(args->uid), CAST_NOCHG(args->gid), AT_SYMLINK_NOFOLLOW)); } int linux_setgroups16(struct thread *td, struct linux_setgroups16_args *args) { struct ucred *newcred, *oldcred; l_gid16_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->gidset, linux_gidset, ngrp * sizeof(l_gid16_t)); if (error) { LIN_SDT_PROBE1(uid16, linux_setgroups16, copyin_error, error); free(linux_gidset, M_LINUX); return (error); } newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) { PROC_UNLOCK(p); crfree(newcred); LIN_SDT_PROBE1(uid16, linux_setgroups16, priv_check_cred_error, error); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(td->td_proc); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); return (error); } int linux_getgroups16(struct thread *td, struct linux_getgroups16_args *args) { struct ucred *cred; l_gid16_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. 
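Editor's note on the 16-bit credential syscalls above: linux_chown16() and linux_lchown16() pass their uid/gid arguments through CAST_NOCHG() because the old Linux ABI uses 0xFFFF as the "leave this id alone" marker, while the FreeBSD kern_fchownat()/setre*id() helpers expect -1 for the same meaning. A standalone sketch of that mapping; the macro body is taken from the header above, the sample values are made up.

#include <stdio.h>
#include <stdint.h>

/* Mirrors CAST_NOCHG() above: 0xFFFF in a 16-bit id slot means "do not change". */
#define CAST_NOCHG(x) (((x) == 0xFFFF) ? -1 : (x))

int
main(void)
{
	uint16_t l_uid = 0xFFFF;	/* Linux "don't change" marker */
	uint16_t l_gid = 20;

	printf("uid arg -> %d\n", CAST_NOCHG(l_uid));	/* -1: id left untouched */
	printf("gid arg -> %d\n", CAST_NOCHG(l_gid));	/* 20 */
	return (0);
}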
*/ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->gidset, ngrp * sizeof(l_gid16_t)); free(linux_gidset, M_LINUX); if (error) { LIN_SDT_PROBE1(uid16, linux_getgroups16, copyout_error, error); return (error); } td->td_retval[0] = ngrp; return (0); } int linux_getgid16(struct thread *td, struct linux_getgid16_args *args) { td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid16(struct thread *td, struct linux_getuid16_args *args) { td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getegid16(struct thread *td, struct linux_getegid16_args *args) { struct getegid_args bsd; int error; error = sys_getegid(td, &bsd); return (error); } int linux_geteuid16(struct thread *td, struct linux_geteuid16_args *args) { struct geteuid_args bsd; int error; error = sys_geteuid(td, &bsd); return (error); } int linux_setgid16(struct thread *td, struct linux_setgid16_args *args) { struct setgid_args bsd; int error; bsd.gid = args->gid; error = sys_setgid(td, &bsd); return (error); } int linux_setuid16(struct thread *td, struct linux_setuid16_args *args) { struct setuid_args bsd; int error; bsd.uid = args->uid; error = sys_setuid(td, &bsd); return (error); } int linux_setregid16(struct thread *td, struct linux_setregid16_args *args) { struct setregid_args bsd; int error; bsd.rgid = CAST_NOCHG(args->rgid); bsd.egid = CAST_NOCHG(args->egid); error = sys_setregid(td, &bsd); return (error); } int linux_setreuid16(struct thread *td, struct linux_setreuid16_args *args) { struct setreuid_args bsd; int error; bsd.ruid = CAST_NOCHG(args->ruid); bsd.euid = CAST_NOCHG(args->euid); error = sys_setreuid(td, &bsd); return (error); } int linux_setresgid16(struct thread *td, struct linux_setresgid16_args *args) { struct setresgid_args bsd; int error; bsd.rgid = CAST_NOCHG(args->rgid); bsd.egid = CAST_NOCHG(args->egid); bsd.sgid = CAST_NOCHG(args->sgid); error = sys_setresgid(td, &bsd); return (error); } int linux_setresuid16(struct thread *td, struct linux_setresuid16_args *args) { struct setresuid_args bsd; int error; bsd.ruid = CAST_NOCHG(args->ruid); bsd.euid = CAST_NOCHG(args->euid); bsd.suid = CAST_NOCHG(args->suid); error = sys_setresuid(td, &bsd); return (error); } diff --git a/sys/compat/linux/linux_util.c b/sys/compat/linux/linux_util.c index afb0bb082dd5..4431bdd479c7 100644 --- a/sys/compat/linux/linux_util.c +++ b/sys/compat/linux/linux_util.c @@ -1,339 +1,354 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1995 Frank van der Linden * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: svr4_util.c,v 1.5 1995/01/22 23:44:50 christos Exp */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); MALLOC_DEFINE(M_EPOLL, "lepoll", "Linux events structures"); FEATURE(linuxulator_v4l, "V4L ioctl wrapper support in the linuxulator"); FEATURE(linuxulator_v4l2, "V4L2 ioctl wrapper support in the linuxulator"); /** * Special DTrace provider for the linuxulator. * * In this file we define the provider for the entire linuxulator. All * modules (= files of the linuxulator) use it. * * We define a different name depending on the emulated bitsize, see * ../..//linux{,32}/linux.h, e.g.: * native bitsize = linuxulator * amd64, 32bit emulation = linuxulator32 */ LIN_SDT_PROVIDER_DEFINE(linuxulator); LIN_SDT_PROVIDER_DEFINE(linuxulator32); char linux_emul_path[MAXPATHLEN] = "/compat/linux"; SYSCTL_STRING(_compat_linux, OID_AUTO, emul_path, CTLFLAG_RWTUN, linux_emul_path, sizeof(linux_emul_path), "Linux runtime environment path"); -/* - * Search an alternate path before passing pathname arguments on to - * system calls. Useful for keeping a separate 'emulation tree'. - * - * If cflag is set, we check if an attempt can be made to create the - * named file, i.e. we check if the directory it should be in exists. - */ int -linux_emul_convpath(const char *path, enum uio_seg pathseg, - char **pbuf, int cflag, int dfd) +linux_pwd_onexec(struct thread *td) { - int retval; + struct nameidata nd; + struct pwd *pwd; + int error; - retval = kern_alternate_path(linux_emul_path, path, pathseg, pbuf, - cflag, dfd); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path); + error = namei(&nd); + if (error != 0) { + /* + * Do not bother if we are in chroot or jail. + */ + pwd = pwd_hold(td); + if (pwd->pwd_rdir != rootvnode) { + pwd_drop(pwd); + return (0); + } + pwd_drop(pwd); + return (error); + } + NDFREE_PNBUF(&nd); + pwd_altroot(td, nd.ni_vp); + vrele(nd.ni_vp); + return (0); +} + +void +linux_pwd_onexec_native(struct thread *td) +{ - return (retval); + pwd_altroot(td, NULL); } void linux_msg(const struct thread *td, const char *fmt, ...) 
{ va_list ap; struct proc *p; if (linux_debug == 0) return; p = td->td_proc; printf("linux: jid %d pid %d (%s): ", p->p_ucred->cr_prison->pr_id, (int)p->p_pid, p->p_comm); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } struct device_element { TAILQ_ENTRY(device_element) list; struct linux_device_handler entry; }; static TAILQ_HEAD(, device_element) devices = TAILQ_HEAD_INITIALIZER(devices); static struct linux_device_handler null_handler = { "mem", "mem", "null", "null", 1, 3, 1}; DATA_SET(linux_device_handler_set, null_handler); char * linux_driver_get_name_dev(device_t dev) { struct device_element *de; const char *device_name = device_get_name(dev); if (device_name == NULL) return (NULL); TAILQ_FOREACH(de, &devices, list) { if (strcmp(device_name, de->entry.bsd_driver_name) == 0) return (de->entry.linux_driver_name); } return (NULL); } int linux_driver_get_major_minor(const char *node, int *major, int *minor) { struct device_element *de; unsigned long devno; size_t sz; if (node == NULL || major == NULL || minor == NULL) return (1); sz = sizeof("pts/") - 1; if (strncmp(node, "pts/", sz) == 0 && node[sz] != '\0') { /* * Linux checks major and minors of the slave device * to make sure it's a pty device, so let's make him * believe it is. */ devno = strtoul(node + sz, NULL, 10); *major = 136 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/card") - 1; if (strncmp(node, "dri/card", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/controlD") - 1; if (strncmp(node, "dri/controlD", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/renderD") - 1; if (strncmp(node, "dri/renderD", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("drm/") - 1; if (strncmp(node, "drm/", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } TAILQ_FOREACH(de, &devices, list) { if (strcmp(node, de->entry.bsd_device_name) == 0) { *major = de->entry.linux_major; *minor = de->entry.linux_minor; return (0); } } return (1); } int linux_vn_get_major_minor(const struct vnode *vp, int *major, int *minor) { int error; if (vp->v_type != VCHR) return (ENOTBLK); dev_lock(); if (vp->v_rdev == NULL) { dev_unlock(); return (ENXIO); } error = linux_driver_get_major_minor(devtoname(vp->v_rdev), major, minor); dev_unlock(); return (error); } void translate_vnhook_major_minor(struct vnode *vp, struct stat *sb) { int major, minor; if (vn_isdisk(vp)) { sb->st_mode &= ~S_IFMT; sb->st_mode |= S_IFBLK; } /* * Return the same st_dev for every devfs instance. The reason * for this is to work around an idiosyncrasy of glibc getttynam() * implementation: it checks whether st_dev returned for fd 0 * is the same as st_dev returned for the target of /proc/self/fd/0 * symlink, and with linux chroots having their own devfs instance, * the check will fail if you chroot into it. 
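Editor's note on linux_driver_get_major_minor() above: it synthesizes Linux device numbers from devfs node names so that pty slaves and DRM nodes look the way Linux userland expects, mapping pts/N into the Unix98 pty-slave major range starting at 136 and the dri/*, drm/* nodes onto major 226. A userland sketch of just the pts/ branch, with the arithmetic copied from the function above; the parsing is simplified and assumes a well-formed node name.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirror of the pts/ branch in linux_driver_get_major_minor() above. */
static int
pts_to_linux_dev(const char *node, int *major, int *minor)
{
	const size_t sz = sizeof("pts/") - 1;
	unsigned long devno;

	if (strncmp(node, "pts/", sz) != 0 || node[sz] == '\0')
		return (1);		/* not a pty slave node */
	devno = strtoul(node + sz, NULL, 10);
	*major = 136 + (devno / 256);	/* Unix98 pty slave majors start at 136 */
	*minor = devno % 256;
	return (0);
}

int
main(void)
{
	int major, minor;

	if (pts_to_linux_dev("pts/260", &major, &minor) == 0)
		printf("pts/260 -> %d:%d\n", major, minor);	/* prints 137:4 */
	return (0);
}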
*/ if (rootdevmp != NULL && vp->v_mount->mnt_vfc == rootdevmp->mnt_vfc) sb->st_dev = rootdevmp->mnt_stat.f_fsid.val[0]; if (linux_vn_get_major_minor(vp, &major, &minor) == 0) sb->st_rdev = makedev(major, minor); } char * linux_get_char_devices(void) { struct device_element *de; char *temp, *string, *last; char formated[256]; int current_size = 0, string_size = 1024; string = malloc(string_size, M_LINUX, M_WAITOK); string[0] = '\000'; last = ""; TAILQ_FOREACH(de, &devices, list) { if (!de->entry.linux_char_device) continue; temp = string; if (strcmp(last, de->entry.bsd_driver_name) != 0) { last = de->entry.bsd_driver_name; snprintf(formated, sizeof(formated), "%3d %s\n", de->entry.linux_major, de->entry.linux_device_name); if (strlen(formated) + current_size >= string_size) { string_size *= 2; string = malloc(string_size, M_LINUX, M_WAITOK); bcopy(temp, string, current_size); free(temp, M_LINUX); } strcat(string, formated); current_size = strlen(string); } } return (string); } void linux_free_get_char_devices(char *string) { free(string, M_LINUX); } static int linux_major_starting = 200; int linux_device_register_handler(struct linux_device_handler *d) { struct device_element *de; if (d == NULL) return (EINVAL); de = malloc(sizeof(*de), M_LINUX, M_WAITOK); if (d->linux_major < 0) { d->linux_major = linux_major_starting++; } bcopy(d, &de->entry, sizeof(*d)); /* Add the element to the list, sorted on span. */ TAILQ_INSERT_TAIL(&devices, de, list); return (0); } int linux_device_unregister_handler(struct linux_device_handler *d) { struct device_element *de; if (d == NULL) return (EINVAL); TAILQ_FOREACH(de, &devices, list) { if (bcmp(d, &de->entry, sizeof(*d)) == 0) { TAILQ_REMOVE(&devices, de, list); free(de, M_LINUX); return (0); } } return (EINVAL); } diff --git a/sys/compat/linux/linux_util.h b/sys/compat/linux/linux_util.h index bd364e61e90f..6e2c3d1ed670 100644 --- a/sys/compat/linux/linux_util.h +++ b/sys/compat/linux/linux_util.h @@ -1,202 +1,181 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1995 Frank van der Linden * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * from: svr4_util.h,v 1.5 1994/11/18 02:54:31 christos Exp * from: linux_util.h,v 1.2 1995/03/05 23:23:50 fvdl Exp * $FreeBSD$ */ #ifndef _LINUX_UTIL_H_ #define _LINUX_UTIL_H_ #include extern int linux_debug; MALLOC_DECLARE(M_LINUX); MALLOC_DECLARE(M_EPOLL); extern char linux_emul_path[]; -extern int linux_use_emul_path; -int linux_emul_convpath(const char *, enum uio_seg, char **, int, int); - -#define LUSECONVPATH(td) atomic_load_int(&linux_use_emul_path) - -#define LCONVPATH_AT(upath, pathp, i, dfd) \ - do { \ - int _error; \ - \ - _error = linux_emul_convpath(upath, UIO_USERSPACE, \ - pathp, i, dfd); \ - if (*(pathp) == NULL) \ - return (_error); \ - } while (0) - -#define LCONVPATH(upath, pathp, i) \ - LCONVPATH_AT(upath, pathp, i, AT_FDCWD) - -#define LCONVPATHEXIST(upath, pathp) LCONVPATH(upath, pathp, 0) -#define LCONVPATHEXIST_AT(upath, pathp, dfd) LCONVPATH_AT(upath, pathp, 0, dfd) -#define LCONVPATHCREAT(upath, pathp) LCONVPATH(upath, pathp, 1) -#define LCONVPATHCREAT_AT(upath, pathp, dfd) LCONVPATH_AT(upath, pathp, 1, dfd) -#define LFREEPATH(path) free(path, M_TEMP) +int linux_pwd_onexec(struct thread *); +void linux_pwd_onexec_native(struct thread *); #define DUMMY(s) \ LIN_SDT_PROBE_DEFINE0(dummy, s, not_implemented); \ int \ linux_ ## s(struct thread *td, struct linux_ ## s ## _args *args) \ { \ static pid_t pid; \ \ if (pid != td->td_proc->p_pid) { \ linux_msg(td, "syscall %s not implemented", #s); \ LIN_SDT_PROBE0(dummy, s, not_implemented); \ pid = td->td_proc->p_pid; \ }; \ \ return (ENOSYS); \ } \ struct __hack /* * This is for the syscalls that are not even yet implemented in Linux. * * They're marked as UNIMPL in syscall.master so it will * have nosys record in linux_sysent[]. */ #define UNIMPLEMENTED(s) void linux_msg(const struct thread *td, const char *fmt, ...) 
__printflike(2, 3); struct linux_device_handler { char *bsd_driver_name; char *linux_driver_name; char *bsd_device_name; char *linux_device_name; int linux_major; int linux_minor; int linux_char_device; }; struct stat; int linux_device_register_handler(struct linux_device_handler *h); int linux_device_unregister_handler(struct linux_device_handler *h); char *linux_driver_get_name_dev(device_t dev); int linux_driver_get_major_minor(const char *node, int *major, int *minor); int linux_vn_get_major_minor(const struct vnode *vn, int *major, int *minor); char *linux_get_char_devices(void); void linux_free_get_char_devices(char *string); void translate_vnhook_major_minor(struct vnode *vp, struct stat *sb); #if defined(KTR) #define KTR_LINUX KTR_SUBSYS #define LINUX_CTRFMT(nm, fmt) #nm"("fmt")" #define LINUX_CTR6(f, m, p1, p2, p3, p4, p5, p6) do { \ CTR6(KTR_LINUX, LINUX_CTRFMT(f, m), \ p1, p2, p3, p4, p5, p6); \ } while (0) #define LINUX_CTR(f) LINUX_CTR6(f, "", 0, 0, 0, 0, 0, 0) #define LINUX_CTR0(f, m) LINUX_CTR6(f, m, 0, 0, 0, 0, 0, 0) #define LINUX_CTR1(f, m, p1) LINUX_CTR6(f, m, p1, 0, 0, 0, 0, 0) #define LINUX_CTR2(f, m, p1, p2) LINUX_CTR6(f, m, p1, p2, 0, 0, 0, 0) #define LINUX_CTR3(f, m, p1, p2, p3) LINUX_CTR6(f, m, p1, p2, p3, 0, 0, 0) #define LINUX_CTR4(f, m, p1, p2, p3, p4) LINUX_CTR6(f, m, p1, p2, p3, p4, 0, 0) #define LINUX_CTR5(f, m, p1, p2, p3, p4, p5) LINUX_CTR6(f, m, p1, p2, p3, p4, p5, 0) #else #define LINUX_CTR(f) #define LINUX_CTR0(f, m) #define LINUX_CTR1(f, m, p1) #define LINUX_CTR2(f, m, p1, p2) #define LINUX_CTR3(f, m, p1, p2, p3) #define LINUX_CTR4(f, m, p1, p2, p3, p4) #define LINUX_CTR5(f, m, p1, p2, p3, p4, p5) #define LINUX_CTR6(f, m, p1, p2, p3, p4, p5, p6) #endif /* * Some macros for rate limiting messages: * - noisy if compat.linux.debug = 1 * - print only once if compat.linux.debug > 1 */ #define LINUX_RATELIMIT_MSG_NOTTESTED(_what) \ do { \ static int seen = 0; \ \ if (seen == 0 && linux_debug >= 2) { \ linux_msg(curthread, "%s is not tested, please report on emulation@FreeBSD.org how it works", _what); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #define LINUX_RATELIMIT_MSG(_message) \ do { \ static int seen = 0; \ \ if (seen == 0) { \ linux_msg(curthread, _message); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #define LINUX_RATELIMIT_MSG_OPT1(_message, _opt1) \ do { \ static int seen = 0; \ \ if (seen == 0) { \ linux_msg(curthread, _message, _opt1); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #define LINUX_RATELIMIT_MSG_OPT2(_message, _opt1, _opt2) \ do { \ static int seen = 0; \ \ if (seen == 0) { \ linux_msg(curthread, _message, _opt1, _opt2); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #endif /* ! _LINUX_UTIL_H_ */ diff --git a/sys/i386/linux/linux_machdep.c b/sys/i386/linux/linux_machdep.c index d86cbdb4d816..92b37646d28d 100644 --- a/sys/i386/linux/linux_machdep.c +++ b/sys/i386/linux/linux_machdep.c @@ -1,954 +1,945 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
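Editor's note on the DUMMY() stub generator and the LINUX_RATELIMIT_MSG*() macros above: both implement a "complain once, then stay quiet" policy, DUMMY() by remembering the last pid that hit the unimplemented syscall and the rate-limit macros by latching a static seen flag unless the debug level asks for full noise. A hedged userland sketch of the same idea; the threshold of 3 mirrors the macros above, and printf() stands in for linux_msg().

#include <stdio.h>

static int linux_debug = 1;	/* sketch of the sysctl-backed debug knob */

/* Warn-once pattern, as in LINUX_RATELIMIT_MSG() above. */
#define RATELIMIT_MSG(msg)						\
	do {								\
		static int seen = 0;					\
									\
		if (seen == 0) {					\
			printf("linux: %s\n", (msg));			\
			if (linux_debug < 3)				\
				seen = 1;	/* latch: print only once */ \
		}							\
	} while (0)

int
main(void)
{
	for (int i = 0; i < 3; i++)
		RATELIMIT_MSG("syscall foo not implemented");
	/* With linux_debug < 3 this prints a single line, not three. */
	return (0);
}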
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needed for pcb definition in linux_set_thread_area */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct l_descriptor { l_uint entry_number; l_ulong base_addr; l_uint limit; l_uint seg_32bit:1; l_uint contents:2; l_uint read_exec_only:1; l_uint limit_in_pages:1; l_uint seg_not_present:1; l_uint useable:1; }; struct l_old_select_argv { l_int nfds; l_fd_set *readfds; l_fd_set *writefds; l_fd_set *exceptfds; struct l_timeval *timeout; }; struct l_ipc_kludge { struct l_msgbuf *msgp; l_long msgtyp; }; int linux_ipc(struct thread *td, struct linux_ipc_args *args) { switch (args->what & 0xFFFF) { case LINUX_SEMOP: { return (kern_semop(td, args->arg1, PTRIN(args->ptr), args->arg2, NULL)); } case LINUX_SEMGET: { struct linux_semget_args a; a.key = args->arg1; a.nsems = args->arg2; a.semflg = args->arg3; return (linux_semget(td, &a)); } case LINUX_SEMCTL: { struct linux_semctl_args a; int error; a.semid = args->arg1; a.semnum = args->arg2; a.cmd = args->arg3; error = copyin(PTRIN(args->ptr), &a.arg, sizeof(a.arg)); if (error) return (error); return (linux_semctl(td, &a)); } case LINUX_SEMTIMEDOP: { struct linux_semtimedop_args a; a.semid = args->arg1; a.tsops = PTRIN(args->ptr); a.nsops = args->arg2; a.timeout = PTRIN(args->arg5); return (linux_semtimedop(td, &a)); } case LINUX_MSGSND: { struct linux_msgsnd_args a; a.msqid = args->arg1; a.msgp = PTRIN(args->ptr); a.msgsz = args->arg2; a.msgflg = args->arg3; return (linux_msgsnd(td, &a)); } case LINUX_MSGRCV: { struct linux_msgrcv_args a; a.msqid = args->arg1; a.msgsz = args->arg2; a.msgflg = args->arg3; if ((args->what >> 16) == 0) { struct l_ipc_kludge tmp; int error; if (args->ptr == 0) return (EINVAL); error = copyin(PTRIN(args->ptr), &tmp, sizeof(tmp)); if (error) return (error); a.msgp = PTRIN(tmp.msgp); a.msgtyp = tmp.msgtyp; } else { a.msgp = PTRIN(args->ptr); a.msgtyp = args->arg5; } return (linux_msgrcv(td, &a)); } case LINUX_MSGGET: { struct linux_msgget_args a; a.key = args->arg1; a.msgflg = args->arg2; return (linux_msgget(td, &a)); } case LINUX_MSGCTL: { struct linux_msgctl_args a; a.msqid = args->arg1; a.cmd = args->arg2; a.buf = PTRIN(args->ptr); return (linux_msgctl(td, &a)); } case LINUX_SHMAT: { struct linux_shmat_args a; l_uintptr_t addr; int error; a.shmid = args->arg1; a.shmaddr = PTRIN(args->ptr); a.shmflg = args->arg2; error = linux_shmat(td, &a); if (error != 0) return (error); addr = td->td_retval[0]; error = 
copyout(&addr, PTRIN(args->arg3), sizeof(addr)); td->td_retval[0] = 0; return (error); } case LINUX_SHMDT: { struct linux_shmdt_args a; a.shmaddr = PTRIN(args->ptr); return (linux_shmdt(td, &a)); } case LINUX_SHMGET: { struct linux_shmget_args a; a.key = args->arg1; a.size = args->arg2; a.shmflg = args->arg3; return (linux_shmget(td, &a)); } case LINUX_SHMCTL: { struct linux_shmctl_args a; a.shmid = args->arg1; a.cmd = args->arg2; a.buf = PTRIN(args->ptr); return (linux_shmctl(td, &a)); } default: break; } return (EINVAL); } int linux_old_select(struct thread *td, struct linux_old_select_args *args) { struct l_old_select_argv linux_args; struct linux_select_args newsel; int error; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); newsel.nfds = linux_args.nfds; newsel.readfds = linux_args.readfds; newsel.writefds = linux_args.writefds; newsel.exceptfds = linux_args.exceptfds; newsel.timeout = linux_args.timeout; return (linux_select(td, &newsel)); } int linux_set_cloned_tls(struct thread *td, void *desc) { struct segment_descriptor sd; struct l_user_desc info; int idx, error; int a[2]; error = copyin(desc, &info, sizeof(struct l_user_desc)); if (error) { linux_msg(td, "set_cloned_tls copyin failed!"); } else { idx = info.entry_number; /* * looks like we're getting the idx we returned * in the set_thread_area() syscall */ if (idx != 6 && idx != 3) { linux_msg(td, "set_cloned_tls resetting idx!"); idx = 3; } /* this doesnt happen in practice */ if (idx == 6) { /* we might copy out the entry_number as 3 */ info.entry_number = 3; error = copyout(&info, desc, sizeof(struct l_user_desc)); if (error) linux_msg(td, "set_cloned_tls copyout failed!"); } a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); memcpy(&sd, &a, sizeof(a)); /* set %gs */ td->td_pcb->pcb_gsd = sd; td->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL); } return (error); } int linux_set_upcall(struct thread *td, register_t stack) { if (stack) td->td_frame->tf_esp = stack; /* * The newly created Linux thread returns * to the user space by the same path that a parent do. 
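Editor's note on linux_ipc() above: it is the classic System V IPC multiplexer, where the low 16 bits of args->what select the operation and the high 16 bits carry a calling-convention version, which the LINUX_MSGRCV case uses to decide between old-style (pointer-to-struct) and new-style argument passing. A tiny decoder sketch; the concrete operation numbers live in the Linux headers and are not shown here, so the sample value below is illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Decode the "what" word the way linux_ipc() above does. */
static void
decode_ipc_what(uint32_t what)
{
	uint32_t op = what & 0xFFFF;
	uint32_t version = what >> 16;

	printf("what=%#x -> op=%u, version=%u%s\n", what, op, version,
	    version == 0 ? " (old-style argument passing)" : "");
}

int
main(void)
{
	decode_ipc_what(12);			/* some op code, version 0 */
	decode_ipc_what(12 | (1u << 16));	/* same op, new-style */
	return (0);
}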
*/ td->td_frame->tf_eax = 0; return (0); } int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { return (linux_mmap_common(td, args->addr, args->len, args->prot, args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff * PAGE_SIZE)); } int linux_mmap(struct thread *td, struct linux_mmap_args *args) { int error; struct l_mmap_argv linux_args; error = copyin(args->ptr, &linux_args, sizeof(linux_args)); if (error) return (error); return (linux_mmap_common(td, linux_args.addr, linux_args.len, linux_args.prot, linux_args.flags, linux_args.fd, (uint32_t)linux_args.pgoff)); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len, uap->prot)); } int linux_madvise(struct thread *td, struct linux_madvise_args *uap) { return (linux_madvise_common(td, PTROUT(uap->addr), uap->len, uap->behav)); } int linux_ioperm(struct thread *td, struct linux_ioperm_args *args) { int error; struct i386_ioperm_args iia; iia.start = args->start; iia.length = args->length; iia.enable = args->enable; error = i386_set_ioperm(td, &iia); return (error); } int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; if (args->level < 0 || args->level > 3) return (EINVAL); if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) | (args->level * (PSL_IOPL / 3)); return (0); } int linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap) { int error; struct i386_ldt_args ldt; struct l_descriptor ld; union descriptor desc; int size, written; switch (uap->func) { case 0x00: /* read_ldt */ ldt.start = 0; ldt.descs = uap->ptr; ldt.num = uap->bytecount / sizeof(union descriptor); error = i386_get_ldt(td, &ldt); td->td_retval[0] *= sizeof(union descriptor); break; case 0x02: /* read_default_ldt = 0 */ size = 5*sizeof(struct l_desc_struct); if (size > uap->bytecount) size = uap->bytecount; for (written = error = 0; written < size && error == 0; written++) error = subyte((char *)uap->ptr + written, 0); td->td_retval[0] = written; break; case 0x01: /* write_ldt */ case 0x11: /* write_ldt */ if (uap->bytecount != sizeof(ld)) return (EINVAL); error = copyin(uap->ptr, &ld, sizeof(ld)); if (error) return (error); ldt.start = ld.entry_number; ldt.descs = &desc; ldt.num = 1; desc.sd.sd_lolimit = (ld.limit & 0x0000ffff); desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16; desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff); desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24; desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) | (ld.contents << 2); desc.sd.sd_dpl = 3; desc.sd.sd_p = (ld.seg_not_present ^ 1); desc.sd.sd_xx = 0; desc.sd.sd_def32 = ld.seg_32bit; desc.sd.sd_gran = ld.limit_in_pages; error = i386_set_ldt(td, &ldt, &desc); break; default: error = ENOSYS; break; } if (error == EOPNOTSUPP) { linux_msg(td, "modify_ldt needs kernel option USER_LDT"); error = ENOSYS; } return (error); } int linux_sigaction(struct thread *td, struct linux_sigaction_args *args) { l_osigaction_t osa; l_sigaction_t act, oact; int error; if (args->nsa != NULL) { error = copyin(args->nsa, &osa, sizeof(l_osigaction_t)); if (error) return (error); act.lsa_handler = osa.lsa_handler; act.lsa_flags = osa.lsa_flags; act.lsa_restorer = osa.lsa_restorer; LINUX_SIGEMPTYSET(act.lsa_mask); act.lsa_mask.__mask = osa.lsa_mask; } error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL, args->osa ? 
&oact : NULL); if (args->osa != NULL && !error) { osa.lsa_handler = oact.lsa_handler; osa.lsa_flags = oact.lsa_flags; osa.lsa_restorer = oact.lsa_restorer; osa.lsa_mask = oact.lsa_mask.__mask; error = copyout(&osa, args->osa, sizeof(l_osigaction_t)); } return (error); } /* * Linux has two extra args, restart and oldmask. We dont use these, * but it seems that "restart" is actually a context pointer that * enables the signal to happen with a different register set. */ int linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args) { sigset_t sigmask; l_sigset_t mask; LINUX_SIGEMPTYSET(mask); mask.__mask = args->mask; linux_to_bsd_sigset(&mask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_pause(struct thread *td, struct linux_pause_args *args) { struct proc *p = td->td_proc; sigset_t sigmask; PROC_LOCK(p); sigmask = td->td_sigmask; PROC_UNLOCK(p); return (kern_sigsuspend(td, sigmask)); } int linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args) { struct l_user_desc info; int error; int idx; int a[2]; struct segment_descriptor sd; error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); idx = info.entry_number; /* * Semantics of Linux version: every thread in the system has array of * 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This * syscall loads one of the selected tls decriptors with a value and * also loads GDT descriptors 6, 7 and 8 with the content of the * per-thread descriptors. * * Semantics of FreeBSD version: I think we can ignore that Linux has 3 * per-thread descriptors and use just the 1st one. The tls_array[] * is used only in set/get-thread_area() syscalls and for loading the * GDT descriptors. In FreeBSD we use just one GDT descriptor for TLS * so we will load just one. * * XXX: this doesn't work when a user space process tries to use more * than 1 TLS segment. Comment in the Linux sources says wine might do * this. */ /* * we support just GLIBC TLS now * we should let 3 proceed as well because we use this segment so * if code does two subsequent calls it should succeed */ if (idx != 6 && idx != -1 && idx != 3) return (EINVAL); /* * we have to copy out the GDT entry we use * FreeBSD uses GDT entry #3 for storing %gs so load that * * XXX: what if a user space program doesn't check this value and tries * to use 6, 7 or 8? */ idx = info.entry_number = 3; error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (error); if (LINUX_LDT_empty(&info)) { a[0] = 0; a[1] = 0; } else { a[0] = LINUX_LDT_entry_a(&info); a[1] = LINUX_LDT_entry_b(&info); } memcpy(&sd, &a, sizeof(a)); /* this is taken from i386 version of cpu_set_user_tls() */ critical_enter(); /* set %gs */ td->td_pcb->pcb_gsd = sd; PCPU_GET(fsgs_gdt)[1] = sd; load_gs(GSEL(GUGS_SEL, SEL_UPL)); critical_exit(); return (0); } int linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args) { struct l_user_desc info; int error; int idx; struct l_desc_struct desc; struct segment_descriptor sd; error = copyin(args->desc, &info, sizeof(struct l_user_desc)); if (error) return (error); idx = info.entry_number; /* XXX: I am not sure if we want 3 to be allowed too. 
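Editor's note on linux_set_thread_area() above: Linux exposes three per-thread TLS slots, but the i386 Linuxulator backs them all with the single GDT descriptor FreeBSD uses for %gs, so an incoming entry_number outside {-1, 3, 6} is rejected and an accepted one is rewritten to 3 and copied back to user space. A small sketch of just that index normalization; the accepted set and the resulting index are taken from the code above.

#include <errno.h>
#include <stdio.h>

/* Mirrors the entry_number policy in linux_set_thread_area() above. */
static int
normalize_tls_entry(int requested, int *chosen)
{
	if (requested != 6 && requested != -1 && requested != 3)
		return (EINVAL);	/* only the GLIBC TLS slot is supported */
	*chosen = 3;			/* the GDT slot FreeBSD uses for %gs */
	return (0);
}

int
main(void)
{
	int idx;

	if (normalize_tls_entry(-1, &idx) == 0)
		printf("-1 (allocate) -> entry %d\n", idx);
	if (normalize_tls_entry(7, &idx) != 0)
		printf("7 -> rejected (EINVAL)\n");
	return (0);
}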
*/ if (idx != 6 && idx != 3) return (EINVAL); idx = 3; memset(&info, 0, sizeof(info)); sd = PCPU_GET(fsgs_gdt)[1]; memcpy(&desc, &sd, sizeof(desc)); info.entry_number = idx; info.base_addr = LINUX_GET_BASE(&desc); info.limit = LINUX_GET_LIMIT(&desc); info.seg_32bit = LINUX_GET_32BIT(&desc); info.contents = LINUX_GET_CONTENTS(&desc); info.read_exec_only = !LINUX_GET_WRITABLE(&desc); info.limit_in_pages = LINUX_GET_LIMIT_PAGES(&desc); info.seg_not_present = !LINUX_GET_PRESENT(&desc); info.useable = LINUX_GET_USEABLE(&desc); error = copyout(&info, args->desc, sizeof(struct l_user_desc)); if (error) return (EFAULT); return (0); } /* XXX: this wont work with module - convert it */ int linux_mq_open(struct thread *td, struct linux_mq_open_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_open(td, (struct kmq_open_args *)args)); #else return (ENOSYS); #endif } int linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_unlink(td, (struct kmq_unlink_args *)args)); #else return (ENOSYS); #endif } int linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_timedsend(td, (struct kmq_timedsend_args *)args)); #else return (ENOSYS); #endif } int linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_timedreceive(td, (struct kmq_timedreceive_args *)args)); #else return (ENOSYS); #endif } int linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_notify(td, (struct kmq_notify_args *)args)); #else return (ENOSYS); #endif } int linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args) { #ifdef P1003_1B_MQUEUE return (sys_kmq_setattr(td, (struct kmq_setattr_args *)args)); #else return (ENOSYS); #endif } void bsd_to_linux_regset(const struct reg *b_reg, struct linux_pt_regset *l_regset) { l_regset->ebx = b_reg->r_ebx; l_regset->ecx = b_reg->r_ecx; l_regset->edx = b_reg->r_edx; l_regset->esi = b_reg->r_esi; l_regset->edi = b_reg->r_edi; l_regset->ebp = b_reg->r_ebp; l_regset->eax = b_reg->r_eax; l_regset->ds = b_reg->r_ds; l_regset->es = b_reg->r_es; l_regset->fs = b_reg->r_fs; l_regset->gs = b_reg->r_gs; l_regset->orig_eax = b_reg->r_eax; l_regset->eip = b_reg->r_eip; l_regset->cs = b_reg->r_cs; l_regset->eflags = b_reg->r_eflags; l_regset->esp = b_reg->r_esp; l_regset->ss = b_reg->r_ss; } int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; vm_map_t map; vm_map_entry_t entry; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; - char *library; ssize_t aresid; int error; bool locked, opened, textset; a_out = NULL; vp = NULL; locked = false; textset = false; opened = false; - if (!LUSECONVPATH(td)) { - NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, - UIO_USERSPACE, args->library); - error = namei(&ni); - } else { - LCONVPATHEXIST(args->library, &library); - NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, - UIO_SYSSPACE, library); - error = namei(&ni); - LFREEPATH(library); - } + NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_USERSPACE, args->library); + error = namei(&ni); if (error) goto cleanup; vp = ni.ni_vp; NDFREE_PNBUF(&ni); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). 
*/ locked = true; /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; opened = true; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. */ error = VOP_SET_TEXT(vp); if (error != 0) goto cleanup; textset = true; /* * Lock no longer needed */ locked = false; VOP_UNLOCK(vp); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. 
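Editor's note on linux_uselib() above: it only accepts old Linux a.out shared libraries, requiring bits 16..23 of a_magic to be 0x64 and using the low 16 bits to pick the variant, with ZMAGIC (0413) images starting 1024 bytes into the file and QMAGIC (0314) images at offset 0; everything else fails with ENOEXEC. A userland sketch of that header triage; the magic values and offsets are those used above, and struct exec is reduced to the one field the check needs.

#include <stdio.h>
#include <stdint.h>

/* Just enough of struct exec for the checks done in linux_uselib() above. */
struct mini_exec {
	uint32_t a_magic;
};

static int
aout_file_offset(const struct mini_exec *a_out, unsigned long *file_offset)
{
	if (((a_out->a_magic >> 16) & 0xff) != 0x64)
		return (-1);			/* not a Linux a.out image */
	switch ((int)(a_out->a_magic & 0xffff)) {
	case 0413:				/* ZMAGIC: 1 KiB header on disk */
		*file_offset = 1024;
		return (0);
	case 0314:				/* QMAGIC: text starts at offset 0 */
		*file_offset = 0;
		return (0);
	default:
		return (-1);			/* ENOEXEC in the real code */
	}
}

int
main(void)
{
	struct mini_exec e = { .a_magic = (0x64u << 16) | 0314 };
	unsigned long off;

	if (aout_file_offset(&e, &off) == 0)
		printf("QMAGIC library, file offset %lu\n", off);
	return (0);
}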
		 */
		map = &td->td_proc->p_vmspace->vm_map;
		error = vm_mmap(map, &vmaddr,
		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
		if (error)
			goto cleanup;
		vm_map_lock(map);
		if (!vm_map_lookup_entry(map, vmaddr, &entry)) {
			vm_map_unlock(map);
			error = EDOOFUS;
			goto cleanup;
		}
		entry->eflags |= MAP_ENTRY_VN_EXEC;
		vm_map_unlock(map);
		textset = false;
	}

	if (bss_size != 0) {
		/* Calculate BSS start address */
		vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
		    a_out->a_data;

		/* allocate some 'anon' space */
		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
		    &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
		    VM_PROT_ALL, 0);
		if (error)
			goto cleanup;
	}

cleanup:
	if (opened) {
		if (locked)
			VOP_UNLOCK(vp);
		locked = false;
		VOP_CLOSE(vp, FREAD, td->td_ucred, td);
	}
	if (textset) {
		if (!locked) {
			locked = true;
			VOP_LOCK(vp, LK_SHARED | LK_RETRY);
		}
		VOP_UNSET_TEXT_CHECKED(vp);
	}
	if (locked)
		VOP_UNLOCK(vp);

	/* Release the temporary mapping. */
	if (a_out)
		kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);

	return (error);
}
diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c
index 55b20200cadf..ab6eaa97075d 100644
--- a/sys/i386/linux/linux_sysvec.c
+++ b/sys/i386/linux/linux_sysvec.c
@@ -1,912 +1,907 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1994-1996 Søren Schmidt
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - LINUX_VDSOPAGE_SIZE) #define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX_USRSTACK LINUX_SHAREDPAGE #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux_vdso_so_o_start; extern char _binary_linux_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; extern const char *linux_syscallnames[]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(uintptr_t *stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static void linux_set_fork_retval(struct thread *td); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); LINUX_VDSO_SYM_CHAR(linux_platform); LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall); LINUX_VDSO_SYM_INTPTR(linux_vdso_sigcode); LINUX_VDSO_SYM_INTPTR(linux_vdso_rt_sigcode); LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); LINUX_VDSO_SYM_INTPTR(kern_cpu_selector); static int linux_fixup(uintptr_t *stack_base, struct image_params *imgp) { register_t *base, *argv, *envp; base = (register_t *)*stack_base; argv = base; envp = base + (imgp->args->argc + 1); base--; suword(base, (intptr_t)envp); base--; suword(base, (intptr_t)argv); base--; suword(base, imgp->args->argc); *stack_base = (uintptr_t)base; return (0); } void linux32_arch_copyout_auxargs(struct image_params *imgp, Elf_Auxinfo **pos) { AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY((*pos), LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY((*pos), LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY((*pos), LINUX_AT_PLATFORM, PTROUT(linux_platform)); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno); code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_uc); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* Build the signal context to be used by sigreturn. */ frame.sf_uc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_uc.uc_sigmask); frame.sf_uc.uc_mcontext.sc_mask = frame.sf_uc.uc_sigmask.__mask; frame.sf_uc.uc_mcontext.sc_gs = rgs(); frame.sf_uc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_uc.uc_mcontext.sc_es = regs->tf_es; frame.sf_uc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_uc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_uc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_uc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_uc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_uc.uc_mcontext.sc_esp = regs->tf_esp; frame.sf_uc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_uc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_uc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_uc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_uc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_uc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_uc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_uc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_uc.uc_mcontext.sc_err = regs->tf_err; frame.sf_uc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_uc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = PTROUT(fp); regs->tf_eip = linux_vdso_rt_sigcode; regs->tf_edi = PTROUT(catcher); regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = linux_translate_traps(ksi->ksi_signo, ksi->ksi_trapno); mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_sig = sig; frame.sf_sigmask = *mask; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_esp = regs->tf_esp; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = PTROUT(fp); regs->tf_eip = linux_vdso_sigcode; regs->tf_edi = PTROUT(catcher); regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } kern_sigprocmask(td, SIG_SETMASK, &frame.sf_sigmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. 
*/ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* Call sigaltstack & ignore results. 
*/ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_eax; sa->original_code = sa->code; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_eax = bsd_to_linux_errno(error); } } static void linux_set_fork_retval(struct thread *td) { struct trapframe *frame = td->td_frame; frame->tf_eax = 0; } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel. */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, - .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32 | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = linux_syscallnames, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = NULL, .sv_hwcap2 = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, .sv_set_fork_retval = linux_set_fork_retval, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_fixup = __elfN(freebsd_fixup), .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = __linuxN(prepare_notes), - .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = 
VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __linuxN(copyout_auxargs), .sv_copyout_strings = __linuxN(copyout_strings), .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = NULL, .sv_hwcap2 = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, .sv_set_fork_retval = linux_set_fork_retval, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error = 0; if (SV_PROC_FLAG(p, SV_SHP) != 0) error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); if (error == 0) - linux_on_exec(p, imgp); + error = linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). */ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_shared_page_base + sv->sv_timekeep_offset; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); tkoff = kern_cpu_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_cpu_selector_idx(); if (bootverbose) printf("Linux i386 vDSO cpu_selector: %u\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux_vdso_so_o_start; char *vdso_end = &_binary_linux_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); linux_vdso_base = LINUX_VDSOPAGE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Shdr *shdr; const Elf_Rel *rel; const Elf_Ehdr *ehdr; Elf_Addr *where; Elf_Size rtype, symidx; Elf_Addr addr, addend; int i, relcnt; MPASS(offset != 0); relcnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < 
ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); relcnt = shdr[i].sh_size / sizeof(*rel); break; case SHT_RELA: printf("Linux i386 vDSO: unexpected Rela section\n"); break; } } for (i = 0; i < relcnt; i++, rel++) { where = (Elf_Addr *)(mapping + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); switch (rtype) { case R_386_NONE: /* none */ break; case R_386_RELATIVE: /* B + A */ addr = (Elf_Addr)PTROUT(offset + addend); if (*where != addr) *where = addr; break; case R_386_IRELATIVE: printf("Linux i386 vDSO: unexpected ifunc relocation, " "symbol index %d\n", symidx); break; default: printf("Linux i386 vDSO: unexpected relocation type %d, " "symbol index %d\n", rtype, symidx); } } } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", - .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE | LINUX_BI_FUTEX_REQUEUE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); linux_dev_shm_create(); linux_osd_jail_register(); linux_netlink_register(); stclohz = (stathz ? 
			    stathz : hz);
			if (bootverbose)
				printf("Linux ELF exec handler installed\n");
		} else
			printf("cannot insert Linux ELF brand handler\n");
		break;
	case MOD_UNLOAD:
		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
		    ++brandinfo)
			if (elf32_brand_inuse(*brandinfo))
				error = EBUSY;
		if (error == 0) {
			for (brandinfo = &linux_brandlist[0];
			    *brandinfo != NULL; ++brandinfo)
				if (elf32_remove_brand_entry(*brandinfo) < 0)
					error = EINVAL;
		}
		if (error == 0) {
			SET_FOREACH(lihp, linux_ioctl_handler_set)
				linux_ioctl_unregister_handler(*lihp);
			linux_netlink_deregister();
			linux_dev_shm_destroy();
			linux_osd_jail_deregister();
			if (bootverbose)
				printf("Linux ELF exec handler removed\n");
		} else
			printf("Could not deinstall ELF interpreter entry\n");
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (error);
}

static moduledata_t linux_elf_mod = {
	"linuxelf",
	linux_elf_modevent,
	0
};

DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(linuxelf, netlink, 1, 1, 1);
FEATURE(linux, "Linux 32bit support");