diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index 2d3406582a33..7be7411a960a 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -1,956 +1,1057 @@ /*- * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux64, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - \ + LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE_LA48 (LINUX_VDSOPAGE_LA48 - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK_LA48 LINUX_SHAREDPAGE_LA48 +#define LINUX_PS_STRINGS_LA48 (LINUX_USRSTACK_LA48 - \ + sizeof(struct ps_strings)) + static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_vsyscall(struct thread *td); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; sa->code = frame->tf_rax; sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_rax = td->td_retval[0]; frame->tf_r10 = frame->tf_rcx; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes, * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. * We saved this in tf_err. * */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; break; case EJUSTRETURN: break; default: frame->tf_rax = bsd_to_linux_errno(error); frame->tf_r10 = frame->tf_rcx; break; } /* * Differently from FreeBSD native ABI, on Linux only %rcx * and %r11 values are not preserved across the syscall. * Require full context restore to get all registers except * those two restored at return to usermode. * * XXX: Would be great to be able to avoid PCB_FULL_IRET * for the error == 0 case. */ set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { Elf_Auxargs *args; Elf_Auxinfo *argarray, *pos; struct proc *p; int error, issetugid; p = imgp->proc; args = (Elf64_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf_Addr *base; base = (Elf64_Addr *)*stack_base; base--; if (suword(base, (uint64_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf64_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * Starting with 2.24, glibc depends on a 16-byte stack alignment. * One "long argc" will be prepended later. */ vectp = (char **)((((uintptr_t)vectp + 8) & ~0xF) - 8); /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; set_pcb_flags(pcb, PCB_FULL_IRET); saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } /* * Copied from amd64/amd64/machdep.c * * XXX fpu state need? don't think so */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_ucontext uc; struct l_sigcontext *context; struct trapframe *regs; unsigned long rflags; int error; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); if (error != 0) return (error); p = td->td_proc; context = &uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; int sig, code; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Save user context. */ bzero(&sf, sizeof(sf)); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe); } else sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig); sf.sf_handler = catcher; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #define LINUX_VSYSCALL_START (-10UL << 20) #define LINUX_VSYSCALL_SZ 1024 const unsigned long linux_vsyscall_vector[] = { LINUX_SYS_gettimeofday, LINUX_SYS_linux_time, LINUX_SYS_linux_getcpu, }; static int linux_vsyscall(struct thread *td) { struct trapframe *frame; uint64_t retqaddr; int code, traced; int error; frame = td->td_frame; /* Check %rip for vsyscall area. */ if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) return (EINVAL); if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) return (EINVAL); code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; if (code >= nitems(linux_vsyscall_vector)) return (EINVAL); /* * vsyscall called as callq *(%rax), so we must * use return address from %rsp and also fixup %rsp. */ error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); if (error) return (error); frame->tf_rip = retqaddr; frame->tf_rax = linux_vsyscall_vector[code]; frame->tf_rsp += 8; traced = (frame->tf_flags & PSL_T); amd64_syscall(td, traced); return (0); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_rt_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, - .sv_usrstack = USRSTACK_LA48, - .sv_psstrings = PS_STRINGS_LA48, + .sv_usrstack = LINUX_USRSTACK_LA48, + .sv_psstrings = LINUX_PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | - SV_SIG_WAITNDQ, + SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE_LA48, + .sv_shared_page_base = LINUX_SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { + int error; - linux_on_exec(p, imgp); - return (0); + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); } static void -linux_vdso_install(void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + amd64_lower_shared_page(sv); + /* Fill timekeep_base */ + exec_sysvec_init(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - amd64_lower_shared_page(&elf_linux_sysvec); - - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX_VDSOPAGE_LA48; + if (hw_lower_amd64_sharedpage != 0) + linux_vdso_base -= PAGE_SIZE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Ehdr *ehdr; + const Elf_Shdr *shdr; + Elf64_Addr *where, val; + Elf_Size rtype, symidx; + const Elf_Rela *rela; + Elf_Addr addr, addend; + int relacnt; + int i, j; + + MPASS(offset != 0); + + relacnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + printf("Linux x86_64 vDSO: unexpected Rel section\n"); + break; + case SHT_RELA: + rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); + relacnt = shdr[i].sh_size / sizeof(*rela); + } + } + + for (j = 0; j < relacnt; j++, rela++) { + where = (Elf_Addr *)(mapping + rela->r_offset); + addend = rela->r_addend; + rtype = ELF_R_TYPE(rela->r_info); + symidx = ELF_R_SYM(rela->r_info); + + switch (rtype) { + case R_X86_64_NONE: /* none */ + break; + + case R_X86_64_RELATIVE: /* B + A */ + addr = (Elf_Addr)(offset + addend); + val = addr; + if (*where != val) + *where = val; + break; + case R_X86_64_IRELATIVE: + printf("Linux x86_64 vDSO: unexpected ifunc relocation, " + "symbol index %ld\n", symidx); + break; + default: + printf("Linux x86_64 vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", rtype, symidx); + } + } +} + static char GNULINUX_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNULINUX_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-x86_64.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux x86-64 ELF exec handler installed\n"); } else printf("cannot insert Linux x86-64 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "Linux 64bit support"); diff --git a/sys/amd64/linux/linux_vdso.lds.s b/sys/amd64/linux/linux_vdso.lds.s index 94f0266095fb..ccf7c80565bb 100644 --- a/sys/amd64/linux/linux_vdso.lds.s +++ b/sys/amd64/linux/linux_vdso.lds.s @@ -1,69 +1,73 @@ /* * Linker script for 64-bit vDSO. * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S * * $FreeBSD$ */ SECTIONS { . = . + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .note : { *(.note.*) } :text :note .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text .data : { *(.data*) *(.sdata*) *(.got.plt) *(.got) *(.gnu.linkonce.d.*) *(.bss*) *(.dynbss*) *(.gnu.linkonce.b.*) } .altinstructions : { *(.altinstructions) } .altinstr_replacement : { *(.altinstr_replacement) } . = ALIGN(0x100); .text : { *(.test .text*) } :text =0x90909090 } PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; } VERSION { LINUX_2.6 { global: - time; __vdso_time; - gettimeofday; __vdso_gettimeofday; - getcpu; __vdso_getcpu; - clock_gettime; __vdso_clock_gettime; + __vdso_clock_getres; + local: *; + }; + + LINUX_0.0 { + global: linux_rt_sigcode; linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } diff --git a/sys/amd64/linux/linux_vdso_gtod.c b/sys/amd64/linux/linux_vdso_gtod.c new file mode 100644 index 000000000000..ad23dc33575a --- /dev/null +++ b/sys/amd64/linux/linux_vdso_gtod.c @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +/* for debug purpose */ +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_write), "D"(fd), "S"(buf), "d"(size) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime), "D"(clock_id), "S"(ts) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_gettimeofday), "D"(tv), "S"(tz) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_getres), "D"(clock_id), "S"(ts) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_getcpu_fallback(uint32_t *cpu, uint32_t *node, void *cache) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_getcpu), "D"(cpu), "S"(node), "d"(cache) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_time), "D"(tm) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +#include diff --git a/sys/amd64/linux32/linux32_locore.asm b/sys/amd64/linux32/linux32_locore.asm index 7dbde5a2c854..f96b3e730f9f 100644 --- a/sys/amd64/linux32/linux32_locore.asm +++ b/sys/amd64/linux32/linux32_locore.asm @@ -1,156 +1,156 @@ /* $FreeBSD$ */ #include "linux32_assym.h" /* system definitions */ #include /* miscellaneous asm macros */ #include /* system call numbers */ .data .globl linux_platform linux_platform: .asciz "i686" .text .code32 /* * To avoid excess stack frame the signal trampoline code emulates * the 'call' instruction. */ -ENTRY(linux32_sigcode) +ENTRY(__kernel_sigreturn) movl %esp, %ebx /* preserve sigframe */ call .getip0 .getip0: popl %eax add $.startsigcode-.getip0, %eax /* ret address */ push %eax jmp *LINUX_SIGF_HANDLER(%ebx) .startsigcode: popl %eax movl $LINUX32_SYS_linux_sigreturn,%eax /* linux_sigreturn() */ int $0x80 /* enter kernel with args */ .endsigcode: 0: jmp 0b -ENTRY(linux32_rt_sigcode) +ENTRY(__kernel_rt_sigreturn) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */ movl %esp, %edi call .getip1 .getip1: popl %eax add $.startrtsigcode-.getip1, %eax /* ret address */ push %eax jmp *LINUX_RT_SIGF_HANDLER(%edi) .startrtsigcode: movl $LINUX32_SYS_linux_rt_sigreturn,%eax /* linux_rt_sigreturn() */ int $0x80 /* enter kernel with args */ .endrtsigcode: 0: jmp 0b -ENTRY(linux32_vsyscall) +ENTRY(__kernel_vsyscall) .startvsyscall: int $0x80 ret .endvsyscall: #if 0 .section .note.Linux, "a",@note .long 2f - 1f /* namesz */ .balign 4 .long 4f - 3f /* descsz */ .long 0 1: .asciz "Linux" 2: .balign 4 3: .long LINUX_VERSION_CODE 4: .balign 4 .previous #endif #define do_cfa_expr(offset) \ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ .uleb128 11f-10f; /* length */ \ 10: .byte 0x74; /* DW_OP_breg4 */ \ .sleb128 offset; /* offset */ \ .byte 0x06; /* DW_OP_deref */ \ 11: /* CIE */ .section .eh_frame,"a",@progbits .LSTARTFRAMEDLSI1: .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 .LSTARTCIEDLSI1: .long 0 /* CIE ID */ .byte 1 /* Version number */ .string "zRS" /* NULL-terminated * augmentation string */ .uleb128 1 /* Code alignment factor */ .sleb128 -4 /* Data alignment factor */ .byte 8 /* Return address * register column */ .uleb128 1 /* Augmentation value length */ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ .byte 0 /* DW_CFA_nop */ .align 4 .LENDCIEDLSI1: /* FDE */ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ .LSTARTFDEDLSI1: .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ .long .startsigcode-. /* PC-relative start address */ .long .endsigcode-.startsigcode .uleb128 0 /* Augmentation */ do_cfa_expr(LINUX_SIGF_SC-8) .align 4 .LENDFDEDLSI1: .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ .LSTARTFDEDLSI2: .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ .long .startrtsigcode-. /* PC-relative start address */ .long .endrtsigcode-.startrtsigcode .uleb128 0 /* Augmentation */ do_cfa_expr(LINUX_RT_SIGF_SC-4+LINUX_SC_ESP) .align 4 .LENDFDEDLSI2: .previous .section .eh_frame,"a",@progbits .LSTARTFRAMEDLSI2: .long .LENDCIEDLSI2-.LSTARTCIEDLSI2 .LSTARTCIEDLSI2: .long 0 /* CIE ID */ .byte 1 /* Version number */ .string "zR" /* NULL-terminated * augmentation string */ .uleb128 1 /* Code alignment factor */ .sleb128 -4 /* Data alignment factor */ .byte 8 /* Return address register column */ .uleb128 1 /* Augmentation value length */ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ .byte 0x0c /* DW_CFA_def_cfa */ .uleb128 4 .uleb128 4 .byte 0x88 /* DW_CFA_offset, column 0x8 */ .uleb128 1 .align 4 .LENDCIEDLSI2: .long .LENDFDEDLSI3-.LSTARTFDEDLSI3 /* Length FDE */ .LSTARTFDEDLSI3: .long .LSTARTFDEDLSI3-.LSTARTFRAMEDLSI2 /* CIE pointer */ .long .startvsyscall-. /* PC-relative start address */ .long .endvsyscall-.startvsyscall .uleb128 0 .align 4 .LENDFDEDLSI3: .previous diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 90b6207a50c9..165eb7b2a1d2 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -1,1126 +1,1220 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE) -#define LINUX32_SHAREDPAGE (LINUX32_MAXUSER - PAGE_SIZE) +#define LINUX32_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX32_VDSOPAGE (LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE) +#define LINUX32_SHAREDPAGE (LINUX32_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ #define LINUX32_USRSTACK LINUX32_SHAREDPAGE static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux32_locore_o_start; -extern char _binary_linux32_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux32_vdso_so_o_start; +extern char _binary_linux32_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static void linux32_fixlimit(struct rlimit *rl, int which); static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux32_set_syscall_retval(struct thread *td, int error); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux32_vsyscall); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary)); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp)); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf32_Addr *base; base = (Elf32_Addr *)*stack_base; base--; if (suword32(base, (uint32_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); - regs->tf_rip = linux32_rt_sigcode; + regs->tf_rip = __kernel_rt_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); - regs->tf_rip = linux32_sigcode; + regs->tf_rip = __kernel_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; lmask.__mask = frame.sf_extramask[0]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void linux32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_rax = bsd_to_linux_errno(error); } } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = (register_t)imgp->ps_strings; x86_clear_dbregs(pcb); fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); } /* * XXX copied from ia32_sysvec.c. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; u_int32_t *vectp; char *stringp; uintptr_t destp, ustringp; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(uint32_t)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(uint32_t)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(uint32_t)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(uint32_t)); } vectp = (uint32_t *)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword32(vectp++, 0) != 0) return (EFAULT); if (suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword32(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux32_locore_o_start, + .sv_sigcode = &_binary_linux32_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP | - SV_SIG_DISCIGN | SV_SIG_WAITNDQ, + SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux32_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { + int error; - linux_on_exec(p, imgp); - return (0); + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + LINUX32_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); } static void -linux_vdso_install(void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - linux_szsigcode = (&_binary_linux32_locore_o_end - - &_binary_linux32_locore_o_start); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux32_vdso_so_o_start; + char *vdso_end = &_binary_linux32_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX32_VDSOPAGE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Shdr *shdr; + const Elf_Rel *rel; + const Elf_Ehdr *ehdr; + Elf32_Addr *where; + Elf_Size rtype, symidx; + Elf32_Addr addr, addend; + int i, relcnt; + + MPASS(offset != 0); + + relcnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); + relcnt = shdr[i].sh_size / sizeof(*rel); + break; + case SHT_RELA: + printf("Linux i386 vDSO: unexpected Rela section\n"); + break; + } + } + + for (i = 0; i < relcnt; i++, rel++) { + where = (Elf32_Addr *)(mapping + rel->r_offset); + addend = *where; + rtype = ELF_R_TYPE(rel->r_info); + symidx = ELF_R_SYM(rel->r_info); + + switch (rtype) { + case R_386_NONE: /* none */ + break; + + case R_386_RELATIVE: /* B + A */ + addr = (Elf32_Addr)PTROUT(offset + addend); + if (*where != addr) + *where = addr; + break; + + case R_386_IRELATIVE: + printf("Linux i386 vDSO: unexpected ifunc relocation, " + "symbol index %ld\n", (intmax_t)symidx); + break; + default: + printf("Linux i386 vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/amd64/linux32/linux32_vdso.lds.s b/sys/amd64/linux32/linux32_vdso.lds.s index a49c209a1ebc..0a392e6380b6 100644 --- a/sys/amd64/linux32/linux32_vdso.lds.s +++ b/sys/amd64/linux32/linux32_vdso.lds.s @@ -1,66 +1,80 @@ /* * Linker script for 32-bit vDSO. * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S * and arch/x86/vdso/vdso32/vdso32.lds.S * * $FreeBSD$ */ SECTIONS { . = . + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .note : { *(.note.*) } :text :note .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text .data : { *(.data*) *(.sdata*) *(.got.plt) *(.got) *(.gnu.linkonce.d.*) *(.bss*) *(.dynbss*) *(.gnu.linkonce.b.*) } .altinstructions : { *(.altinstructions) } .altinstr_replacement : { *(.altinstr_replacement) } . = ALIGN(0x100); .text : { *(.text*) } :text =0x90909090 } PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; } -ENTRY(linux32_vsyscall); - VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + __vdso_clock_getres; + __vdso_clock_gettime64; + }; + LINUX_2.5 { global: - linux32_vsyscall; - linux32_sigcode; - linux32_rt_sigcode; + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; + + LINUX_0.0 { + global: linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } diff --git a/sys/amd64/linux32/linux32_vdso_gtod.c b/sys/amd64/linux32/linux32_vdso_gtod.c new file mode 100644 index 000000000000..f1573ca3c1b1 --- /dev/null +++ b/sys/amd64/linux32/linux32_vdso_gtod.c @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_write), "b"(fd), "c"(buf), "d"(size) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_gettimeofday), "b"(tv), "c"(tz) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_getres), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_time), "b"(tm) + : "cc", "memory" + ); + return (res); +} + +#include diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index 706b97894f9c..1d467540fd36 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -1,589 +1,681 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * Copyright (c) 2018 Turing Robotic Industries Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif MODULE_VERSION(linux64elf, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - \ + LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK LINUX_SHAREDPAGE +#define LINUX_PS_STRINGS (LINUX_USRSTACK - \ + sizeof(struct ps_strings)) + static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static int linux_elf_fixup(uintptr_t *stack_base, struct image_params *iparams); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /* DTrace probes */ LIN_SDT_PROBE_DEFINE2(sysvec, linux_translate_traps, todo, "int", "int"); LIN_SDT_PROBE_DEFINE0(sysvec, linux_exec_setregs, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_copyout_auxargs, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_elf_fixup, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_rt_sigreturn, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_rt_sendsig, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_install, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_deinstall, todo); +LINUX_VDSO_SYM_CHAR(linux_platform); +LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); +LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn); + /* LINUXTODO: do we have traps to translate? */ static int linux_translate_traps(int signal, int trap_code) { LIN_SDT_PROBE2(sysvec, linux_translate_traps, todo, signal, trap_code); return (signal); } -LINUX_VDSO_SYM_CHAR(linux_platform); - static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct syscall_args *sa; register_t *ap; p = td->td_proc; ap = td->td_frame->tf_x; sa = &td->td_sa; sa->code = td->td_frame->tf_x[8]; sa->original_code = sa->code; /* LINUXTODO: generic syscall? */ if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; if (sa->callp->sy_narg > MAXARGS) panic("ARM64TODO: Could we have more than %d args?", MAXARGS); memcpy(sa->args, ap, MAXARGS * sizeof(register_t)); td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { td->td_retval[1] = td->td_frame->tf_x[1]; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) td->td_frame->tf_x[0] = bsd_to_linux_errno(error); } } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { Elf_Auxargs *args; Elf_Auxinfo *argarray, *pos; struct proc *p; int error, issetugid; LIN_SDT_PROBE0(sysvec, linux_copyout_auxargs, todo); p = imgp->proc; args = (Elf64_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, *imgp->sysent->sv_hwcap); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, *imgp->sysent->sv_hwcap2); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_elf_fixup(uintptr_t *stack_base, struct image_params *imgp) { LIN_SDT_PROBE0(sysvec, linux_elf_fixup, todo); return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. * LINUXTODO: deduplicate against other linuxulator archs */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; int argc, envc, error; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf64_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for argc and the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= 1 + imgp->args->argc + 1 + imgp->args->envc + 1; vectp = (char **)STACKALIGN(vectp); /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); if (suword(vectp++, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* LINUXTODO: validate */ LIN_SDT_PROBE0(sysvec, linux_exec_setregs, todo); memset(regs, 0, sizeof(*regs)); /* glibc start.S registers function pointer in x0 with atexit. */ regs->tf_sp = stack; #if 0 /* LINUXTODO: See if this is used. */ regs->tf_lr = imgp->entry_addr; #else regs->tf_lr = 0xffffffffffffffff; #endif regs->tf_elr = imgp->entry_addr; pcb->pcb_tpidr_el0 = 0; pcb->pcb_tpidrro_el0 = 0; WRITE_SPECIALREG(tpidrro_el0, 0); WRITE_SPECIALREG(tpidr_el0, 0); #ifdef VFP vfp_reset_state(td, pcb); #endif /* * Clear debug register state. It is not applicable to the new process. */ bzero(&pcb->pcb_dbg_regs, sizeof(pcb->pcb_dbg_regs)); } int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { /* LINUXTODO: implement */ LIN_SDT_PROBE0(sysvec, linux_rt_sigreturn, todo); return (EDOOFUS); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { /* LINUXTODO: implement */ LIN_SDT_PROBE0(sysvec, linux_rt_sendsig, todo); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_elf_fixup, .sv_sendsig = linux_rt_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, /* XXX */ + .sv_usrstack = LINUX_USRSTACK, + .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | - SV_SIG_WAITNDQ, + SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = &elf_hwcap, .sv_hwcap2 = &elf_hwcap2, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { + int error; - linux_on_exec(p, imgp); - return (0); + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); } static void -linux_vdso_install(const void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); +static void +linux_vdso_install(const void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("invalid Linux VDSO size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX_VDSOPAGE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - memcpy(linux_shared_page_mapping, elf_linux_sysvec.sv_sigcode, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { - LIN_SDT_PROBE0(sysvec, linux_vdso_deinstall, todo); - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + Elf_Size rtype, symidx; + const Elf_Rela *rela; + const Elf_Shdr *shdr; + const Elf_Ehdr *ehdr; + Elf_Addr *where; + Elf_Addr addr, addend; + int i, relacnt; + + MPASS(offset != 0); + + relacnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + printf("Linux Aarch64 vDSO: unexpected Rel section\n"); + break; + case SHT_RELA: + rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); + relacnt = shdr[i].sh_size / sizeof(*rela); + } + } + + for (i = 0; i < relacnt; i++, rela++) { + where = (Elf_Addr *)(mapping + rela->r_offset); + addend = rela->r_addend; + rtype = ELF_R_TYPE(rela->r_info); + symidx = ELF_R_SYM(rela->r_info); + + switch (rtype) { + case R_AARCH64_NONE: /* none */ + break; + + case R_AARCH64_RELATIVE: /* B + A */ + addr = (Elf_Addr)(mapping + addend); + if (*where != addr) + *where = addr; + break; + default: + printf("Linux Aarch64 vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", rtype, symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNU_ABI_LINUX = 0; /* LINUXTODO: deduplicate */ static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNU_ABI_LINUX) return (false); *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; struct linux_ioctl_handler**lihp; int error; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux arm64 ELF exec handler installed\n"); } break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "AArch64 Linux 64bit support"); diff --git a/sys/arm64/linux/linux_vdso.lds.s b/sys/arm64/linux/linux_vdso.lds.s index 86f8de91bf60..98cbb9a5736b 100644 --- a/sys/arm64/linux/linux_vdso.lds.s +++ b/sys/arm64/linux/linux_vdso.lds.s @@ -1,22 +1,73 @@ /* - * Stub arm64 vdso linker script. - * LINUXTODO: update along with VDSO implementation + * Linker script for 64-bit vDSO. + * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S * * $FreeBSD$ */ SECTIONS { . = . + SIZEOF_HEADERS; - .text : { *(.text*) } - .rodata : { *(.rodata*) } - .hash : { *(.hash) } + + .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } - .data : { *(.data*) } - .dynamic : { *(.dynamic) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + . = ALIGN(0x100); + .text : { *(.test .text*) } :text =0x90909090 +} + +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.6.39 { + global: + __kernel_rt_sigreturn; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + local: *; + }; + + LINUX_0.0 { + global: + linux_platform; + kern_timekeep_base; + local: *; + }; } diff --git a/sys/arm64/linux/linux_vdso_gtod.c b/sys/arm64/linux/linux_vdso_gtod.c new file mode 100644 index 000000000000..682735cf2fa1 --- /dev/null +++ b/sys/arm64/linux/linux_vdso_gtod.c @@ -0,0 +1,153 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +static int +write(int lfd, const void *lbuf, size_t lsize) +{ + register long svc asm("x8") = LINUX_SYS_write; + register int fd asm("x0") = lfd; + register const char *buf asm("x1") = lbuf; + register long size asm("x2") = lsize; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (fd), "r" (buf), "r" (size), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *lts) +{ + register long svc asm("x8") = LINUX_SYS_linux_clock_gettime; + register clockid_t clockid asm("x0") = clock_id; + register struct l_timespec *ts asm("x1") = lts; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (clockid), "r" (ts), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *ltv, struct timezone *ltz) +{ + register long svc asm("x8") = LINUX_SYS_gettimeofday; + register l_timeval *tv asm("x0") = ltv; + register struct timezone *tz asm("x1") = ltz; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (tv), "r" (tz), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *lts) +{ + register long svc asm("x8") = LINUX_SYS_linux_clock_getres; + register clockid_t clockid asm("x0") = clock_id; + register struct l_timespec *ts asm("x1") = lts; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (clockid), "r" (ts), "r" (svc) + : "memory"); + return (res); +} + +/* + * copied from lib/libc/aarch64/sys/__vdso_gettc.c + */ + +static inline uint64_t +cp15_cntvct_get(void) +{ + uint64_t reg; + + __asm __volatile("mrs %0, cntvct_el0" : "=r" (reg)); + return (reg); +} + +static inline uint64_t +cp15_cntpct_get(void) +{ + uint64_t reg; + + __asm __volatile("mrs %0, cntpct_el0" : "=r" (reg)); + return (reg); +} + +int +__vdso_gettc(const struct vdso_timehands *th, u_int *tc) +{ + + if (th->th_algo != VDSO_TH_ALGO_ARM_GENTIM) + return (ENOSYS); + __asm __volatile("isb" : : : "memory"); + *tc = th->th_physical == 0 ? cp15_cntvct_get() : cp15_cntpct_get(); + return (0); +} + +#include diff --git a/sys/compat/linux/linux_vdso.c b/sys/compat/linux/linux_vdso.c index a05c0b01ff40..af90941d1981 100644 --- a/sys/compat/linux/linux_vdso.c +++ b/sys/compat/linux/linux_vdso.c @@ -1,248 +1,184 @@ /*- * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) #define __ELF_WORD_SIZE 32 #else #define __ELF_WORD_SIZE 64 #endif #include #include #include +#include #include -#include +#include #include #include #include -#include #include #include #include -#include #include #include #include #include #include SLIST_HEAD(, linux_vdso_sym) __elfN(linux_vdso_syms) = SLIST_HEAD_INITIALIZER(__elfN(linux_vdso_syms)); -static int __elfN(symtabindex); -static int __elfN(symstrindex); - -static void -__elfN(linux_vdso_lookup)(Elf_Ehdr *, struct linux_vdso_sym *); - void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *s) { SLIST_INSERT_HEAD(&__elfN(linux_vdso_syms), s, sym); } vm_object_t -__elfN(linux_shared_page_init)(char **mapping) +__elfN(linux_shared_page_init)(char **mapping, vm_size_t size) { vm_page_t m; vm_object_t obj; vm_offset_t addr; + size_t n, pages; + + pages = size / PAGE_SIZE; - obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, + addr = kva_alloc(size); + obj = vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(obj); - m = vm_page_grab(obj, 0, VM_ALLOC_ZERO); + for (n = 0; n < pages; n++) { + m = vm_page_grab(obj, n, + VM_ALLOC_ZERO); + vm_page_valid(m); + vm_page_xunbusy(m); + pmap_qenter(addr + n * PAGE_SIZE, &m, 1); + } VM_OBJECT_WUNLOCK(obj); - vm_page_valid(m); - vm_page_xunbusy(m); - addr = kva_alloc(PAGE_SIZE); - pmap_qenter(addr, &m, 1); *mapping = (char *)addr; return (obj); } void -__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping) +__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping, + vm_size_t size) { vm_offset_t va; va = (vm_offset_t)mapping; - pmap_qremove(va, 1); - kva_free(va, PAGE_SIZE); + pmap_qremove(va, size / PAGE_SIZE); + kva_free(va, size); vm_object_deallocate(obj); } void -__elfN(linux_vdso_fixup)(struct sysentvec *sv) +__elfN(linux_vdso_fixup)(char *base, vm_offset_t offset) { + struct linux_vdso_sym *lsym; + const Elf_Shdr *shdr; Elf_Ehdr *ehdr; - Elf_Shdr *shdr; - int i; + Elf_Sym *dsym, *sym; + char *strtab, *symname; + int i, symcnt; - ehdr = (Elf_Ehdr *) sv->sv_sigcode; + ehdr = (Elf_Ehdr *)base; - if (!IS_ELF(*ehdr) || - ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || - ehdr->e_ident[EI_DATA] != ELF_TARG_DATA || - ehdr->e_ident[EI_VERSION] != EV_CURRENT || - ehdr->e_shoff == 0 || - ehdr->e_shentsize != sizeof(Elf_Shdr)) - panic("Linux invalid vdso header.\n"); + MPASS(IS_ELF(*ehdr)); + MPASS(ehdr->e_ident[EI_CLASS] == ELF_TARG_CLASS); + MPASS(ehdr->e_ident[EI_DATA] == ELF_TARG_DATA); + MPASS(ehdr->e_ident[EI_VERSION] == EV_CURRENT); + MPASS(ehdr->e_shentsize == sizeof(Elf_Shdr)); + MPASS(ehdr->e_shoff != 0); + MPASS(ehdr->e_type == ET_DYN); - if (ehdr->e_type != ET_DYN) - panic("Linux invalid vdso header.\n"); + shdr = (const Elf_Shdr *)(base + ehdr->e_shoff); - shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff); - - __elfN(symtabindex) = -1; - __elfN(symstrindex) = -1; + dsym = NULL; for (i = 0; i < ehdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; if (shdr[i].sh_type == SHT_DYNSYM) { - __elfN(symtabindex) = i; - __elfN(symstrindex) = shdr[i].sh_link; + dsym = (Elf_Sym *)(base + shdr[i].sh_offset); + strtab = base + shdr[shdr[i].sh_link].sh_offset; + symcnt = shdr[i].sh_size / sizeof(*dsym); + break; } } - - if (__elfN(symtabindex) == -1 || __elfN(symstrindex) == -1) - panic("Linux invalid vdso header.\n"); + MPASS(dsym != NULL); ehdr->e_ident[EI_OSABI] = ELFOSABI_LINUX; -} -void -__elfN(linux_vdso_reloc)(struct sysentvec *sv) -{ - struct linux_vdso_sym *lsym; - Elf_Ehdr *ehdr; - Elf_Phdr *phdr; - Elf_Shdr *shdr; - Elf_Dyn *dyn; - Elf_Sym *sym; - int i, j, symcnt; - - ehdr = (Elf_Ehdr *) sv->sv_sigcode; - - /* Adjust our so relative to the sigcode_base */ - if (sv->sv_shared_page_base != 0) { - ehdr->e_entry += sv->sv_shared_page_base; - phdr = (Elf_Phdr *)((caddr_t)ehdr + ehdr->e_phoff); - - /* phdrs */ - for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += sv->sv_shared_page_base; - if (phdr[i].p_type != PT_DYNAMIC) - continue; - dyn = (Elf_Dyn *)((caddr_t)ehdr + phdr[i].p_offset); - for(; dyn->d_tag != DT_NULL; dyn++) { - switch (dyn->d_tag) { - case DT_PLTGOT: - case DT_HASH: - case DT_STRTAB: - case DT_SYMTAB: - case DT_RELA: - case DT_INIT: - case DT_FINI: - case DT_REL: - case DT_DEBUG: - case DT_JMPREL: - case DT_VERSYM: - case DT_VERDEF: - case DT_VERNEED: - case DT_ADDRRNGLO ... DT_ADDRRNGHI: - dyn->d_un.d_ptr += sv->sv_shared_page_base; - break; - case DT_ENCODING ... DT_LOOS-1: - case DT_LOOS ... DT_HIOS: - if (dyn->d_tag >= DT_ENCODING && - (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += sv->sv_shared_page_base; - break; - default: - break; - } - } - } + /* + * VDSO is readonly mapped to the process VA and + * can't be relocated by rtld. + */ + SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) { + for (i = 0, sym = dsym; i < symcnt; i++, sym++) { + symname = strtab + sym->st_name; + if (strncmp(lsym->symname, symname, lsym->size) == 0) { + sym->st_value += offset; + *lsym->ptr = sym->st_value; + break; - /* sections */ - shdr = (Elf_Shdr *)((caddr_t)ehdr + ehdr->e_shoff); - for(i = 0; i < ehdr->e_shnum; i++) { - if (!(shdr[i].sh_flags & SHF_ALLOC)) - continue; - shdr[i].sh_addr += sv->sv_shared_page_base; - if (shdr[i].sh_type != SHT_SYMTAB && - shdr[i].sh_type != SHT_DYNSYM) - continue; - - sym = (Elf_Sym *)((caddr_t)ehdr + shdr[i].sh_offset); - symcnt = shdr[i].sh_size / sizeof(*sym); - - for(j = 0; j < symcnt; j++, sym++) { - if (sym->st_shndx == SHN_UNDEF || - sym->st_shndx == SHN_ABS) - continue; - sym->st_value += sv->sv_shared_page_base; } } } - - SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) - __elfN(linux_vdso_lookup)(ehdr, lsym); } -static void -__elfN(linux_vdso_lookup)(Elf_Ehdr *ehdr, struct linux_vdso_sym *vsym) +int +linux_map_vdso(struct proc *p, vm_object_t obj, vm_offset_t base, + vm_offset_t size, struct image_params *imgp) { - vm_offset_t strtab, symname; - uint32_t symcnt; - Elf_Shdr *shdr; - int i; - - shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff); - - strtab = (vm_offset_t)((caddr_t)ehdr + - shdr[__elfN(symstrindex)].sh_offset); - Elf_Sym *sym = (Elf_Sym *)((caddr_t)ehdr + - shdr[__elfN(symtabindex)].sh_offset); - symcnt = shdr[__elfN(symtabindex)].sh_size / sizeof(*sym); - - for (i = 0; i < symcnt; ++i, ++sym) { - symname = strtab + sym->st_name; - if (strncmp(vsym->symname, (char *)symname, vsym->size) == 0) { - *vsym->ptr = (uintptr_t)sym->st_value; - break; - } + struct vmspace *vmspace; + vm_map_t map; + int error; + + MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX); + MPASS(obj != NULL); + + vmspace = p->p_vmspace; + map = &vmspace->vm_map; + + vm_object_reference(obj); + error = vm_map_fixed(map, obj, 0, base, size, + VM_PROT_READ | VM_PROT_EXECUTE, + VM_PROT_READ | VM_PROT_EXECUTE, + MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + return (vm_mmap_to_errno(error)); } + return (0); } diff --git a/sys/compat/linux/linux_vdso.h b/sys/compat/linux/linux_vdso.h index 073c51696387..870d0dd97fa2 100644 --- a/sys/compat/linux/linux_vdso.h +++ b/sys/compat/linux/linux_vdso.h @@ -1,65 +1,67 @@ /*- * Copyright (c) 2013 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_VDSO_H_ #define _LINUX_VDSO_H_ #include struct linux_vdso_sym { SLIST_ENTRY(linux_vdso_sym) sym; uint32_t size; uintptr_t * ptr; char symname[]; }; -vm_object_t __elfN(linux_shared_page_init)(char **); -void __elfN(linux_shared_page_fini)(vm_object_t, void *); -void __elfN(linux_vdso_fixup)(struct sysentvec *); -void __elfN(linux_vdso_reloc)(struct sysentvec *); +vm_object_t __elfN(linux_shared_page_init)(char **, vm_size_t); +void __elfN(linux_shared_page_fini)(vm_object_t, void *, vm_size_t); +void __elfN(linux_vdso_fixup)(char *, vm_offset_t); void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *); +int linux_map_vdso(struct proc *, vm_object_t, vm_offset_t, + vm_offset_t, struct image_params *); + #define LINUX_VDSO_SYM_INTPTR(name) \ uintptr_t name; \ LINUX_VDSO_SYM_DEFINE(name) #define LINUX_VDSO_SYM_CHAR(name) \ const char * name; \ LINUX_VDSO_SYM_DEFINE(name) #define LINUX_VDSO_SYM_DEFINE(name) \ static struct linux_vdso_sym name ## sym = { \ .symname = #name, \ .size = sizeof(#name), \ .ptr = (uintptr_t *)&name \ }; \ SYSINIT(__elfN(name ## _sym_init), SI_SUB_EXEC, \ SI_ORDER_FIRST, __elfN(linux_vdso_sym_init), &name ## sym); \ struct __hack #endif /* _LINUX_VDSO_H_ */ diff --git a/sys/compat/linux/linux_vdso_gtod.inc b/sys/compat/linux/linux_vdso_gtod.inc new file mode 100644 index 000000000000..a90b7dc8efdf --- /dev/null +++ b/sys/compat/linux/linux_vdso_gtod.inc @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +static int +__vdso_native_to_linux_timespec(struct l_timespec *lts, + struct timespec *nts) +{ + +#ifdef COMPAT_LINUX32 + if (nts->tv_sec > INT_MAX || nts->tv_sec < INT_MIN) + return (LINUX_EOVERFLOW); +#endif + lts->tv_sec = nts->tv_sec; + lts->tv_nsec = nts->tv_nsec; + return (0); +} + +static int +__vdso_native_to_linux_timeval(l_timeval *ltv, + struct timeval *ntv) +{ + +#ifdef COMPAT_LINUX32 + if (ntv->tv_sec > INT_MAX || ntv->tv_sec < INT_MIN) + return (LINUX_EOVERFLOW); +#endif + ltv->tv_sec = ntv->tv_sec; + ltv->tv_usec = ntv->tv_usec; + return (0); +} + + +#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) +static int +__vdso_native_to_linux_timespec64(struct l_timespec64 *lts, + struct timespec *nts) +{ + + lts->tv_sec = nts->tv_sec; + lts->tv_nsec = nts->tv_nsec; + return (0); +} +#endif + +static int +__vdso_linux_to_native_clockid(clockid_t *n, clockid_t l) +{ + + switch (l) { + case LINUX_CLOCK_REALTIME: + *n = CLOCK_REALTIME; + break; + case LINUX_CLOCK_MONOTONIC: + *n = CLOCK_MONOTONIC; + break; + case LINUX_CLOCK_REALTIME_COARSE: + *n = CLOCK_REALTIME_FAST; + break; + case LINUX_CLOCK_MONOTONIC_COARSE: + case LINUX_CLOCK_MONOTONIC_RAW: + *n = CLOCK_MONOTONIC_FAST; + break; + case LINUX_CLOCK_BOOTTIME: + *n = CLOCK_UPTIME; + break; + default: + return (LINUX_EINVAL); + } + return (0); +} + +/* + * The code below adapted from + * lib/libc/sys/__vdso_gettimeofday.c + */ + +static inline void +__vdso_gettimekeep(struct vdso_timekeep **tk) +{ + + *tk = (struct vdso_timekeep *)kern_timekeep_base; +} + +static int +tc_delta(const struct vdso_timehands *th, u_int *delta) +{ + int error; + u_int tc; + + error = __vdso_gettc(th, &tc); + if (error == 0) + *delta = (tc - th->th_offset_count) & th->th_counter_mask; + return (error); +} + +/* + * Calculate the absolute or boot-relative time from the + * machine-specific fast timecounter and the published timehands + * structure read from the shared page. + * + * The lockless reading scheme is similar to the one used to read the + * in-kernel timehands, see sys/kern/kern_tc.c:binuptime(). This code + * is based on the kernel implementation. + */ +static int +freebsd_binuptime(struct bintime *bt, struct vdso_timekeep *tk, bool abs) +{ + struct vdso_timehands *th; + uint32_t curr, gen; + uint64_t scale, x; + u_int delta, scale_bits; + int error; + + do { + if (!tk->tk_enabled) + return (ENOSYS); + + curr = atomic_load_acq_32(&tk->tk_current); + th = &tk->tk_th[curr]; + gen = atomic_load_acq_32(&th->th_gen); + *bt = th->th_offset; + error = tc_delta(th, &delta); + if (error == EAGAIN) + continue; + if (error != 0) + return (error); + scale = th->th_scale; +#ifdef _LP64 + scale_bits = ffsl(scale); +#else + scale_bits = ffsll(scale); +#endif + if (__predict_false(scale_bits + fls(delta) > 63)) { + x = (scale >> 32) * delta; + scale &= 0xffffffff; + bt->sec += x >> 32; + bintime_addx(bt, x << 32); + } + bintime_addx(bt, scale * delta); + if (abs) + bintime_add(bt, &th->th_boottime); + + /* + * Ensure that the load of th_offset is completed + * before the load of th_gen. + */ + atomic_thread_fence_acq(); + } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen); + return (0); +} + +static int +freebsd_getnanouptime(struct bintime *bt, struct vdso_timekeep *tk) +{ + struct vdso_timehands *th; + uint32_t curr, gen; + + do { + if (!tk->tk_enabled) + return (ENOSYS); + + curr = atomic_load_acq_32(&tk->tk_current); + th = &tk->tk_th[curr]; + gen = atomic_load_acq_32(&th->th_gen); + *bt = th->th_offset; + + /* + * Ensure that the load of th_offset is completed + * before the load of th_gen. + */ + atomic_thread_fence_acq(); + } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen); + return (0); +} + +static int +freebsd_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + struct vdso_timekeep *tk; + struct bintime bt; + int error; + + if (tz != NULL) + return (ENOSYS); + __vdso_gettimekeep(&tk); + if (tk == NULL) + return (ENOSYS); + if (tk->tk_ver != VDSO_TK_VER_CURR) + return (ENOSYS); + error = freebsd_binuptime(&bt, tk, true); + if (error == 0) + bintime2timeval(&bt, tv); + return (error); +} + +static int +freebsd_clock_gettime(clockid_t clock_id, struct timespec *ts) +{ + struct vdso_timekeep *tk; + struct bintime bt; + int error; + + __vdso_gettimekeep(&tk); + if (tk == NULL) + return (ENOSYS); + if (tk->tk_ver != VDSO_TK_VER_CURR) + return (ENOSYS); + switch (clock_id) { + case CLOCK_REALTIME: + case CLOCK_REALTIME_PRECISE: + case CLOCK_REALTIME_FAST: + error = freebsd_binuptime(&bt, tk, true); + break; + case CLOCK_MONOTONIC: + case CLOCK_MONOTONIC_PRECISE: + case CLOCK_UPTIME: + case CLOCK_UPTIME_PRECISE: + error = freebsd_binuptime(&bt, tk, false); + break; + case CLOCK_MONOTONIC_FAST: + case CLOCK_UPTIME_FAST: + error = freebsd_getnanouptime(&bt, tk); + break; + default: + error = ENOSYS; + break; + } + if (error == 0) + bintime2timespec(&bt, ts); + return (error); +} + +/* + * Linux vDSO interfaces + * + */ +int +__vdso_clock_gettime(clockid_t clock_id, struct l_timespec *lts) +{ + struct timespec ts; + clockid_t which; + int error; + + error = __vdso_linux_to_native_clockid(&which, clock_id); + if (error != 0) + return (__vdso_clock_gettime_fallback(clock_id, lts)); + error = freebsd_clock_gettime(which, &ts); + if (error == 0) + return (-__vdso_native_to_linux_timespec(lts, &ts)); + else + return (__vdso_clock_gettime_fallback(clock_id, lts)); +} + +int +__vdso_gettimeofday(l_timeval *ltv, struct timezone *tz) +{ + struct timeval tv; + int error; + + error = freebsd_gettimeofday(&tv, tz); + if (error != 0) + return (__vdso_gettimeofday_fallback(ltv, tz)); + return (-__vdso_native_to_linux_timeval(ltv, &tv)); +} + +int +__vdso_clock_getres(clockid_t clock_id, struct l_timespec *lts) +{ + + return (__vdso_clock_getres_fallback(clock_id, lts)); +} + +#if defined(__i386__) || defined(COMPAT_LINUX32) +int +__vdso_clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts) +{ + struct timespec ts; + clockid_t which; + int error; + + error = __vdso_linux_to_native_clockid(&which, clock_id); + if (error != 0) + return (__vdso_clock_gettime64_fallback(clock_id, lts)); + error = freebsd_clock_gettime(which, &ts); + if (error == 0) + return(-__vdso_native_to_linux_timespec64(lts, &ts)); + else + return(__vdso_clock_gettime64_fallback(clock_id, lts)); +} + +int clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts) + __attribute__((weak, alias("__vdso_clock_gettime64"))); +#endif + +#if defined(__amd64__) && !defined(COMPAT_LINUX32) +int +__vdso_getcpu(uint32_t *cpu, uint32_t *node, void *cache) +{ + + return (__vdso_getcpu_fallback(cpu, node, cache)); +} +#endif + +#if defined(__i386__) || defined(__amd64__) +int +__vdso_time(long *tm) +{ + + return (__vdso_time_fallback(tm)); +} +#endif diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h index bab89cb5a43a..af7bb554b340 100644 --- a/sys/i386/linux/linux.h +++ b/sys/i386/linux/linux.h @@ -1,574 +1,571 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * $FreeBSD$ */ #ifndef _I386_LINUX_H_ #define _I386_LINUX_H_ #include #include #include #define LINUX_LEGACY_SYSCALLS #define LINUX_DTRACE linuxulator -#define LINUX_SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) -#define LINUX_USRSTACK LINUX_SHAREDPAGE - /* * Provide a separate set of types for the Linux types. */ typedef int l_int; typedef int32_t l_long; typedef int64_t l_longlong; typedef short l_short; typedef unsigned int l_uint; typedef uint32_t l_ulong; typedef uint64_t l_ulonglong; typedef unsigned short l_ushort; typedef char *l_caddr_t; typedef l_ulong l_uintptr_t; typedef l_long l_clock_t; typedef l_int l_daddr_t; typedef l_ushort l_dev_t; typedef l_uint l_gid_t; typedef l_ushort l_gid16_t; typedef l_ulong l_ino_t; typedef l_int l_key_t; typedef l_longlong l_loff_t; typedef l_ushort l_mode_t; typedef l_long l_off_t; typedef l_int l_pid_t; typedef l_uint l_size_t; typedef l_long l_suseconds_t; typedef l_long l_time_t; typedef l_longlong l_time64_t; typedef l_uint l_uid_t; typedef l_ushort l_uid16_t; typedef l_int l_timer_t; typedef l_int l_mqd_t; typedef l_ulong l_fd_mask; typedef struct { l_int val[2]; } l_fsid_t; typedef struct { l_time_t tv_sec; l_suseconds_t tv_usec; } l_timeval; #define l_fd_set fd_set /* * Miscellaneous */ #define LINUX_AT_COUNT 20 /* Count of used aux entry types. * Keep this synchronized with * linux_fixup_elf() code. */ struct l___sysctl_args { l_int *name; l_int nlen; void *oldval; l_size_t *oldlenp; void *newval; l_size_t newlen; l_ulong __spare[4]; }; /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 #define LINUX_RLIMIT_DATA 2 #define LINUX_RLIMIT_STACK 3 #define LINUX_RLIMIT_CORE 4 #define LINUX_RLIMIT_RSS 5 #define LINUX_RLIMIT_NPROC 6 #define LINUX_RLIMIT_NOFILE 7 #define LINUX_RLIMIT_MEMLOCK 8 #define LINUX_RLIMIT_AS 9 /* Address space limit */ #define LINUX_RLIM_NLIMITS 10 struct l_rlimit { l_ulong rlim_cur; l_ulong rlim_max; }; struct l_mmap_argv { l_uintptr_t addr; l_size_t len; l_int prot; l_int flags; l_int fd; l_off_t pgoff; }; /* * stat family of syscalls */ struct l_timespec { l_time_t tv_sec; l_long tv_nsec; }; /* __kernel_timespec */ struct l_timespec64 { l_time64_t tv_sec; l_longlong tv_nsec; }; struct l_newstat { l_ushort st_dev; l_ushort __pad1; l_ulong st_ino; l_ushort st_mode; l_ushort st_nlink; l_ushort st_uid; l_ushort st_gid; l_ushort st_rdev; l_ushort __pad2; l_ulong st_size; l_ulong st_blksize; l_ulong st_blocks; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_ulong __unused4; l_ulong __unused5; }; struct l_stat { l_ushort st_dev; l_ulong st_ino; l_ushort st_mode; l_ushort st_nlink; l_ushort st_uid; l_ushort st_gid; l_ushort st_rdev; l_long st_size; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_long st_blksize; l_long st_blocks; l_ulong st_flags; l_ulong st_gen; }; struct l_stat64 { l_ushort st_dev; u_char __pad0[10]; l_ulong __st_ino; l_uint st_mode; l_uint st_nlink; l_ulong st_uid; l_ulong st_gid; l_ushort st_rdev; u_char __pad3[10]; l_longlong st_size; l_ulong st_blksize; l_ulong st_blocks; l_ulong __pad4; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_ulonglong st_ino; }; struct l_statfs64 { l_int f_type; l_int f_bsize; uint64_t f_blocks; uint64_t f_bfree; uint64_t f_bavail; uint64_t f_files; uint64_t f_ffree; l_fsid_t f_fsid; l_int f_namelen; l_int f_frsize; l_int f_flags; l_int f_spare[4]; }; #define LINUX_NSIG_WORDS 2 /* sigaction flags */ #define LINUX_SA_NOCLDSTOP 0x00000001 #define LINUX_SA_NOCLDWAIT 0x00000002 #define LINUX_SA_SIGINFO 0x00000004 #define LINUX_SA_RESTORER 0x04000000 #define LINUX_SA_ONSTACK 0x08000000 #define LINUX_SA_RESTART 0x10000000 #define LINUX_SA_INTERRUPT 0x20000000 #define LINUX_SA_NOMASK 0x40000000 #define LINUX_SA_ONESHOT 0x80000000 /* sigprocmask actions */ #define LINUX_SIG_BLOCK 0 #define LINUX_SIG_UNBLOCK 1 #define LINUX_SIG_SETMASK 2 /* sigaltstack */ #define LINUX_MINSIGSTKSZ 2048 typedef void (*l_handler_t)(l_int); typedef l_ulong l_osigset_t; typedef struct { l_handler_t lsa_handler; l_osigset_t lsa_mask; l_ulong lsa_flags; void (*lsa_restorer)(void); } l_osigaction_t; typedef struct { l_handler_t lsa_handler; l_ulong lsa_flags; void (*lsa_restorer)(void); l_sigset_t lsa_mask; } l_sigaction_t; typedef struct { void *ss_sp; l_int ss_flags; l_size_t ss_size; } l_stack_t; /* The Linux sigcontext, pretty much a standard 386 trapframe. */ struct l_sigcontext { l_int sc_gs; l_int sc_fs; l_int sc_es; l_int sc_ds; l_int sc_edi; l_int sc_esi; l_int sc_ebp; l_int sc_esp; l_int sc_ebx; l_int sc_edx; l_int sc_ecx; l_int sc_eax; l_int sc_trapno; l_int sc_err; l_int sc_eip; l_int sc_cs; l_int sc_eflags; l_int sc_esp_at_signal; l_int sc_ss; l_int sc_387; l_int sc_mask; l_int sc_cr2; }; struct l_ucontext { l_ulong uc_flags; void *uc_link; l_stack_t uc_stack; struct l_sigcontext uc_mcontext; l_sigset_t uc_sigmask; }; #define LINUX_SI_MAX_SIZE 128 #define LINUX_SI_PAD_SIZE ((LINUX_SI_MAX_SIZE/sizeof(l_int)) - 3) typedef union l_sigval { l_int sival_int; l_uintptr_t sival_ptr; } l_sigval_t; typedef struct l_siginfo { l_int lsi_signo; l_int lsi_errno; l_int lsi_code; union { l_int _pad[LINUX_SI_PAD_SIZE]; struct { l_pid_t _pid; l_uid_t _uid; } _kill; struct { l_timer_t _tid; l_int _overrun; char _pad[sizeof(l_uid_t) - sizeof(l_int)]; l_sigval_t _sigval; l_int _sys_private; } _timer; struct { l_pid_t _pid; /* sender's pid */ l_uid_t _uid; /* sender's uid */ l_sigval_t _sigval; } _rt; struct { l_pid_t _pid; /* which child */ l_uid_t _uid; /* sender's uid */ l_int _status; /* exit code */ l_clock_t _utime; l_clock_t _stime; } _sigchld; struct { l_uintptr_t _addr; /* Faulting insn/memory ref. */ } _sigfault; struct { l_long _band; /* POLL_IN,POLL_OUT,POLL_MSG */ l_int _fd; } _sigpoll; } _sifields; } l_siginfo_t; #define lsi_pid _sifields._kill._pid #define lsi_uid _sifields._kill._uid #define lsi_tid _sifields._timer._tid #define lsi_overrun _sifields._timer._overrun #define lsi_sys_private _sifields._timer._sys_private #define lsi_status _sifields._sigchld._status #define lsi_utime _sifields._sigchld._utime #define lsi_stime _sifields._sigchld._stime #define lsi_value _sifields._rt._sigval #define lsi_int _sifields._rt._sigval.sival_int #define lsi_ptr _sifields._rt._sigval.sival_ptr #define lsi_addr _sifields._sigfault._addr #define lsi_band _sifields._sigpoll._band #define lsi_fd _sifields._sigpoll._fd struct l_fpreg { u_int16_t significand[4]; u_int16_t exponent; }; struct l_fpxreg { u_int16_t significand[4]; u_int16_t exponent; u_int16_t padding[3]; }; struct l_xmmreg { u_int32_t element[4]; }; struct l_fpstate { /* Regular FPU environment */ u_int32_t cw; u_int32_t sw; u_int32_t tag; u_int32_t ipoff; u_int32_t cssel; u_int32_t dataoff; u_int32_t datasel; struct l_fpreg _st[8]; u_int16_t status; u_int16_t magic; /* 0xffff = regular FPU data */ /* FXSR FPU environment */ u_int32_t _fxsr_env[6]; /* env is ignored. */ u_int32_t mxcsr; u_int32_t reserved; struct l_fpxreg _fxsr_st[8]; /* reg data is ignored. */ struct l_xmmreg _xmm[8]; u_int32_t padding[56]; }; /* * We make the stack look like Linux expects it when calling a signal * handler, but use the BSD way of calling the handler and sigreturn(). * This means that we need to pass the pointer to the handler too. * It is appended to the frame to not interfere with the rest of it. */ struct l_sigframe { l_int sf_sig; struct l_sigcontext sf_sc; struct l_fpstate sf_fpstate; l_uint sf_extramask[LINUX_NSIG_WORDS-1]; l_handler_t sf_handler; }; struct l_rt_sigframe { l_int sf_sig; l_siginfo_t *sf_siginfo; struct l_ucontext *sf_ucontext; l_siginfo_t sf_si; struct l_ucontext sf_sc; l_handler_t sf_handler; }; extern struct sysentvec linux_sysvec; /* * arch specific open/fcntl flags */ #define LINUX_F_GETLK64 12 #define LINUX_F_SETLK64 13 #define LINUX_F_SETLKW64 14 union l_semun { l_int val; l_uintptr_t buf; l_ushort *array; l_uintptr_t __buf; l_uintptr_t __pad; }; struct l_ifmap { l_ulong mem_start; l_ulong mem_end; l_ushort base_addr; u_char irq; u_char dma; u_char port; }; struct l_ifreq { union { char ifrn_name[LINUX_IFNAMSIZ]; } ifr_ifrn; union { struct l_sockaddr ifru_addr; struct l_sockaddr ifru_dstaddr; struct l_sockaddr ifru_broadaddr; struct l_sockaddr ifru_netmask; struct l_sockaddr ifru_hwaddr; l_short ifru_flags[1]; l_int ifru_ivalue; l_int ifru_mtu; struct l_ifmap ifru_map; char ifru_slave[LINUX_IFNAMSIZ]; l_caddr_t ifru_data; } ifr_ifru; }; #define ifr_name ifr_ifrn.ifrn_name /* Interface name */ #define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ #define ifr_ifindex ifr_ifru.ifru_ivalue /* Interface index */ struct l_user_desc { l_uint entry_number; l_uint base_addr; l_uint limit; l_uint seg_32bit:1; l_uint contents:2; l_uint read_exec_only:1; l_uint limit_in_pages:1; l_uint seg_not_present:1; l_uint useable:1; }; struct l_desc_struct { unsigned long a, b; }; #define LINUX_LOWERWORD 0x0000ffff /* * Macros which does the same thing as those in Linux include/asm-um/ldt-i386.h. * These convert Linux user space descriptor to machine one. */ #define LINUX_LDT_entry_a(info) \ ((((info)->base_addr & LINUX_LOWERWORD) << 16) | \ ((info)->limit & LINUX_LOWERWORD)) #define LINUX_ENTRY_B_READ_EXEC_ONLY 9 #define LINUX_ENTRY_B_CONTENTS 10 #define LINUX_ENTRY_B_SEG_NOT_PRESENT 15 #define LINUX_ENTRY_B_BASE_ADDR 16 #define LINUX_ENTRY_B_USEABLE 20 #define LINUX_ENTRY_B_SEG32BIT 22 #define LINUX_ENTRY_B_LIMIT 23 #define LINUX_LDT_entry_b(info) \ (((info)->base_addr & 0xff000000) | \ ((info)->limit & 0xf0000) | \ ((info)->contents << LINUX_ENTRY_B_CONTENTS) | \ (((info)->seg_not_present == 0) << LINUX_ENTRY_B_SEG_NOT_PRESENT) | \ (((info)->base_addr & 0x00ff0000) >> LINUX_ENTRY_B_BASE_ADDR) | \ (((info)->read_exec_only == 0) << LINUX_ENTRY_B_READ_EXEC_ONLY) | \ ((info)->seg_32bit << LINUX_ENTRY_B_SEG32BIT) | \ ((info)->useable << LINUX_ENTRY_B_USEABLE) | \ ((info)->limit_in_pages << LINUX_ENTRY_B_LIMIT) | 0x7000) #define LINUX_LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ (info)->seg_not_present == 1 && \ (info)->read_exec_only == 1 && \ (info)->seg_32bit == 0 && \ (info)->limit_in_pages == 0 && \ (info)->useable == 0) /* * Macros for converting segments. * They do the same as those in arch/i386/kernel/process.c in Linux. */ #define LINUX_GET_BASE(desc) \ ((((desc)->a >> 16) & LINUX_LOWERWORD) | \ (((desc)->b << 16) & 0x00ff0000) | \ ((desc)->b & 0xff000000)) #define LINUX_GET_LIMIT(desc) \ (((desc)->a & LINUX_LOWERWORD) | \ ((desc)->b & 0xf0000)) #define LINUX_GET_32BIT(desc) \ (((desc)->b >> LINUX_ENTRY_B_SEG32BIT) & 1) #define LINUX_GET_CONTENTS(desc) \ (((desc)->b >> LINUX_ENTRY_B_CONTENTS) & 3) #define LINUX_GET_WRITABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_READ_EXEC_ONLY) & 1) #define LINUX_GET_LIMIT_PAGES(desc) \ (((desc)->b >> LINUX_ENTRY_B_LIMIT) & 1) #define LINUX_GET_PRESENT(desc) \ (((desc)->b >> LINUX_ENTRY_B_SEG_NOT_PRESENT) & 1) #define LINUX_GET_USEABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_USEABLE) & 1) #define linux_copyout_rusage(r, u) copyout(r, u, sizeof(*r)) /* robust futexes */ struct linux_robust_list { struct linux_robust_list *next; }; struct linux_robust_list_head { struct linux_robust_list list; l_long futex_offset; struct linux_robust_list *pending_list; }; #endif /* !_I386_LINUX_H_ */ diff --git a/sys/i386/linux/linux_locore.asm b/sys/i386/linux/linux_locore.asm index 8b4cc201574f..4c5246bd5725 100644 --- a/sys/i386/linux/linux_locore.asm +++ b/sys/i386/linux/linux_locore.asm @@ -1,157 +1,157 @@ /* $FreeBSD$ */ #include "linux_assym.h" /* system definitions */ #include /* miscellaneous asm macros */ #include /* system call numbers */ #include "assym.inc" .data .globl linux_platform linux_platform: .asciz "i686" .text /* * To avoid excess stack frame the signal trampoline code emulates * the 'call' instruction. */ -ENTRY(linux_sigcode) +ENTRY(__kernel_sigreturn) movl %esp, %ebx /* preserve sigframe */ call .getip0 .getip0: popl %eax add $.startsigcode-.getip0, %eax /* ret address */ push %eax jmp *LINUX_SIGF_HANDLER(%ebx) .startsigcode: popl %eax /* gcc unwind code need this */ movl $LINUX_SYS_linux_sigreturn,%eax /* linux_sigreturn() */ int $0x80 /* enter kernel with args */ .endsigcode: 0: jmp 0b -ENTRY(linux_rt_sigcode) +ENTRY(__kernel_rt_sigreturn) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */ movl %esp, %edi call .getip1 .getip1: popl %eax add $.startrtsigcode-.getip1, %eax /* ret address */ push %eax jmp *LINUX_RT_SIGF_HANDLER(%edi) .startrtsigcode: movl $LINUX_SYS_linux_rt_sigreturn,%eax /* linux_rt_sigreturn() */ int $0x80 /* enter kernel with args */ .endrtsigcode: 0: jmp 0b -ENTRY(linux_vsyscall) +ENTRY(__kernel_vsyscall) .startvsyscall: int $0x80 ret .endvsyscall: #if 0 .section .note.Linux, "a",@note .long 2f - 1f /* namesz */ .balign 4 .long 4f - 3f /* descsz */ .long 0 1: .asciz "Linux" 2: .balign 4 3: .long LINUX_VERSION_CODE 4: .balign 4 .previous #endif #define do_cfa_expr(offset) \ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ .uleb128 11f-10f; /* length */ \ 10: .byte 0x74; /* DW_OP_breg4 */ \ .sleb128 offset; /* offset */ \ .byte 0x06; /* DW_OP_deref */ \ 11: /* CIE */ .section .eh_frame,"a",@progbits .LSTARTFRAMEDLSI1: .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 .LSTARTCIEDLSI1: .long 0 /* CIE ID */ .byte 1 /* Version number */ .string "zRS" /* NULL-terminated * augmentation string */ .uleb128 1 /* Code alignment factor */ .sleb128 -4 /* Data alignment factor */ .byte 8 /* Return address * register column */ .uleb128 1 /* Augmentation value length */ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ .byte 0 /* DW_CFA_nop */ .align 4 .LENDCIEDLSI1: /* FDE */ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ .LSTARTFDEDLSI1: .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ .long .startsigcode-. /* PC-relative start address */ .long .endsigcode-.startsigcode .uleb128 0 /* Augmentation */ do_cfa_expr(LINUX_SIGF_SC-8) .align 4 .LENDFDEDLSI1: .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ .LSTARTFDEDLSI2: .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ .long .startrtsigcode-. /* PC-relative start address */ .long .endrtsigcode-.startrtsigcode .uleb128 0 /* Augmentation */ do_cfa_expr(LINUX_RT_SIGF_SC-4+LINUX_SC_ESP) .align 4 .LENDFDEDLSI2: .previous .section .eh_frame,"a",@progbits .LSTARTFRAMEDLSI2: .long .LENDCIEDLSI2-.LSTARTCIEDLSI2 .LSTARTCIEDLSI2: .long 0 /* CIE ID */ .byte 1 /* Version number */ .string "zR" /* NULL-terminated * augmentation string */ .uleb128 1 /* Code alignment factor */ .sleb128 -4 /* Data alignment factor */ .byte 8 /* Return address register column */ .uleb128 1 /* Augmentation value length */ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ .byte 0x0c /* DW_CFA_def_cfa */ .uleb128 4 .uleb128 4 .byte 0x88 /* DW_CFA_offset, column 0x8 */ .uleb128 1 .align 4 .LENDCIEDLSI2: .long .LENDFDEDLSI3-.LSTARTFDEDLSI3 /* Length FDE */ .LSTARTFDEDLSI3: .long .LSTARTFDEDLSI3-.LSTARTFRAMEDLSI2 /* CIE pointer */ .long .startvsyscall-. /* PC-relative start address */ .long .endvsyscall-.startvsyscall .uleb128 0 .align 4 .LENDFDEDLSI3: .previous diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c index 048bc6dffdca..f630a0e1f659 100644 --- a/sys/i386/linux/linux_sysvec.c +++ b/sys/i386/linux/linux_sysvec.c @@ -1,1074 +1,1171 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK LINUX_SHAREDPAGE #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(uintptr_t *stack_base, struct image_params *iparams); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)args->argc + 1); base--; suword(base, (intptr_t)envp); base--; suword(base, (intptr_t)argv); base--; suword(base, imgp->args->argc); *stack_base = (uintptr_t)base; return (0); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { struct proc *p; Elf32_Auxargs *args; Elf32_Auxinfo *argarray, *pos; struct ps_strings *arginfo; int error, issetugid; p = imgp->proc; issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; args = (Elf32_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { register_t *base; base = (register_t *)*stack_base; base--; if (suword(base, (register_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copied from kern/kern_exec.c */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* Calculate string base and vector table pointers. */ p = imgp->proc; if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; - regs->tf_eip = linux_rt_sigcode; + regs->tf_eip = __kernel_rt_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig, code; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = ksi->ksi_signo; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_esp = regs->tf_esp; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; - regs->tf_eip = linux_sigcode; + regs->tf_eip = __kernel_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; sigset_t bmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* Call sigaltstack & ignore results. */ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_eax; sa->original_code = sa->code; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_eax = bsd_to_linux_errno(error); } } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel. */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32 | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP | - SV_SIG_DISCIGN | SV_SIG_WAITNDQ, + SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { + int error = 0; - linux_on_exec(p, imgp); - return (0); + if (SV_PROC_FLAG(p, SV_SHP) != 0) + error = linux_map_vdso(p, linux_vdso_obj, + linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); } static void -linux_vdso_install(void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX_VDSOPAGE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Shdr *shdr; + const Elf_Rel *rel; + const Elf_Ehdr *ehdr; + Elf_Addr *where; + Elf_Size rtype, symidx; + Elf_Addr addr, addend; + int i, relcnt; + + MPASS(offset != 0); + + relcnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); + relcnt = shdr[i].sh_size / sizeof(*rel); + break; + case SHT_RELA: + printf("Linux i386 vDSO: unexpected Rela section\n"); + break; + } + } + + for (i = 0; i < relcnt; i++, rel++) { + where = (Elf_Addr *)(mapping + rel->r_offset); + addend = *where; + rtype = ELF_R_TYPE(rel->r_info); + symidx = ELF_R_SYM(rel->r_info); + + switch (rtype) { + case R_386_NONE: /* none */ + break; + + case R_386_RELATIVE: /* B + A */ + addr = (Elf_Addr)PTROUT(offset + addend); + if (*where != addr) + *where = addr; + break; + + case R_386_IRELATIVE: + printf("Linux i386 vDSO: unexpected ifunc relocation, " + "symbol index %d\n", symidx); + break; + default: + printf("Linux i386 vDSO: unexpected relocation type %d, " + "symbol index %d\n", rtype, symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_dev_shm_create(); linux_osd_jail_register(); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); linux_dev_shm_destroy(); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/i386/linux/linux_vdso.lds.s b/sys/i386/linux/linux_vdso.lds.s index dcb61cf5e06f..0a392e6380b6 100644 --- a/sys/i386/linux/linux_vdso.lds.s +++ b/sys/i386/linux/linux_vdso.lds.s @@ -1,65 +1,80 @@ /* * Linker script for 32-bit vDSO. * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S * and arch/x86/vdso/vdso32/vdso32.lds.S * * $FreeBSD$ */ SECTIONS { . = . + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .note : { *(.note.*) } :text :note .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text .data : { *(.data*) *(.sdata*) *(.got.plt) *(.got) *(.gnu.linkonce.d.*) *(.bss*) *(.dynbss*) *(.gnu.linkonce.b.*) } .altinstructions : { *(.altinstructions) } .altinstr_replacement : { *(.altinstr_replacement) } . = ALIGN(0x100); .text : { *(.text*) } :text =0x90909090 } PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ eh_frame_hdr PT_GNU_EH_FRAME; } -ENTRY(linux_vsyscall); - VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + __vdso_clock_getres; + __vdso_clock_gettime64; + }; + LINUX_2.5 { global: - linux_vsyscall; - linux_sigcode; - linux_rt_sigcode; + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; + + LINUX_0.0 { + global: + linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } diff --git a/sys/i386/linux/linux_vdso_gtod.c b/sys/i386/linux/linux_vdso_gtod.c new file mode 100644 index 000000000000..b1e4a4620ee4 --- /dev/null +++ b/sys/i386/linux/linux_vdso_gtod.c @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_write), "b"(fd), "c"(buf), "d"(size) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_gettimeofday), "b"(tv), "c"(tz) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_getres), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_time), "b"(tm) + : "cc", "memory" + ); + return (res); +} + +#include diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile index b2a5816c6919..1304c2d91fd8 100644 --- a/sys/modules/linux/Makefile +++ b/sys/modules/linux/Makefile @@ -1,94 +1,114 @@ # $FreeBSD$ .if ${MACHINE_CPUARCH} == "amd64" SFX= 32 CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 .endif .PATH: ${SRCTOP}/sys/compat/linux ${SRCTOP}/sys/${MACHINE_CPUARCH}/linux${SFX} .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" .PATH: ${SRCTOP}/sys/x86/linux .endif -VDSO= linux${SFX}_vdso - KMOD= linux SRCS= linux_fork.c linux${SFX}_dummy_machdep.c linux_file.c linux_event.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ linux${SFX}_machdep.c linux_misc.c linux_signal.c \ linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \ linux${SFX}_sysvec.c linux_uid16.c linux_time.c \ linux_timer.c linux_vdso.c \ opt_inet6.h opt_compat.h opt_posix.h opt_usb.h vnode_if.h \ device_if.h bus_if.h .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" -SRCS+= linux_dummy_x86.c +SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c +VDSODEPS=linux_vdso_gettc_x86.inc .endif .if ${MACHINE_CPUARCH} == "amd64" SRCS+= linux${SFX}_support.s SRCS+= linux_elf32.c .else SRCS+= linux_copyout.c .endif DPSRCS= assym.inc linux${SFX}_genassym.c # XXX: for assym.inc SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h .if ${MACHINE_CPUARCH} == "i386" SRCS+= opt_apic.h .endif -OBJS= ${VDSO}.so +OBJS= linux${SFX}_vdso.so .if ${MACHINE_CPUARCH} == "i386" SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c linux_mmap.c \ linux_dummy.c linux_emul.c linux_errno.c opt_cpu.h linux.c .endif .if ${MACHINE_CPUARCH} == "i386" EXPORT_SYMS= EXPORT_SYMS+= linux_emul_path EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease EXPORT_SYMS+= linux_ioctl_register_handler EXPORT_SYMS+= linux_ioctl_unregister_handler .endif CLEANFILES= linux${SFX}_assym.h linux${SFX}_genassym.o linux${SFX}_locore.o \ - genassym.o + genassym.o linux${SFX}_vdso_gtod.o linux${SFX}_vdso.so.o linux${SFX}_assym.h: linux${SFX}_genassym.o sh ${SYSDIR}/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET} +.if ${MACHINE_CPUARCH} == "amd64" +VDSOFLAGS=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 -m32 +.endif + linux${SFX}_locore.o: linux${SFX}_assym.h assym.inc - ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -m32 -shared -s \ - -pipe -I. -I${SYSDIR} ${WERROR} -Wall -fno-common -nostdinc -nostdlib \ - -fno-omit-frame-pointer -fPIC \ - -Wl,-T${SRCTOP}/sys/${MACHINE_CPUARCH}/linux${SFX}/${VDSO}.lds.s \ - -Wl,-soname=${VDSO}.so.1,--eh-frame-hdr,-warn-common \ + ${CC} -c -x assembler-with-cpp -DLOCORE -fPIC -pipe -O2 -Werror \ + -msoft-float -mregparm=0 \ + -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} +linux${SFX}_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS} + ${CC} -c -fPIC -pipe -O2 -Werror -msoft-float -mregparm=0 \ + -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ + ${.IMPSRC} -o ${.TARGET} + +linux${SFX}_vdso.so.o: linux${SFX}_locore.o linux${SFX}_vdso_gtod.o + ${LD} -m elf_i386 --shared --eh-frame-hdr -soname=linux-gate.so.1 \ + --no-undefined --hash-style=both -warn-common -nostdlib \ + --strip-debug -s --build-id=sha1 --Bsymbolic \ + -T${SRCTOP}/sys/${MACHINE}/linux${SFX}/linux${SFX}_vdso.lds.s \ + -o ${.TARGET} ${.ALLSRC:M*.o} + +.if ${MACHINE_CPUARCH} == "amd64" +OBJCOPY_TARGET=--output-target elf64-x86-64-freebsd --binary-architecture i386 +.elif ${MACHINE_CPUARCH} == "i386" +OBJCOPY_TARGET=--output-target elf32-i386-freebsd --binary-architecture i386 +.else +.error ${MACHINE_CPUARCH} not yet supported by linux +.endif + +linux${SFX}_vdso.so: linux${SFX}_vdso.so.o + ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \ + linux${SFX}_vdso.so.o ${.TARGET} + ${STRIPBIN} -N _binary_linux${SFX}_vdso_so_o_size ${.TARGET} + .if ${MACHINE_CPUARCH} == "amd64" linux${SFX}_support.o: linux${SFX}_assym.h assym.inc ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} - -${VDSO}.so: linux${SFX}_locore.o - ${OBJCOPY} --input-target binary --output-target elf64-x86-64-freebsd \ - --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux${SFX}_locore_o_size ${.TARGET} -.else -${VDSO}.so: linux${SFX}_locore.o - ${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd \ - --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET} .endif linux${SFX}_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense .endif .include diff --git a/sys/modules/linux64/Makefile b/sys/modules/linux64/Makefile index 5f3f0fd03574..b85203a9ce93 100644 --- a/sys/modules/linux64/Makefile +++ b/sys/modules/linux64/Makefile @@ -1,65 +1,88 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/compat/linux ${SRCTOP}/sys/${MACHINE}/linux .if ${MACHINE_CPUARCH} == "amd64" .PATH: ${SRCTOP}/sys/x86/linux .endif -VDSO= linux_vdso - KMOD= linux64 SRCS= linux_elf64.c linux_fork.c linux_dummy_machdep.c linux_file.c \ linux_event.c linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ linux_machdep.c linux_misc.c linux_ptrace.c linux_signal.c \ linux_socket.c linux_stats.c linux_sysctl.c linux_sysent.c \ linux_sysvec.c linux_time.c linux_vdso.c linux_timer.c \ opt_compat.h opt_inet6.h opt_posix.h opt_usb.h \ vnode_if.h device_if.h bus_if.h \ linux_support.s .if ${MACHINE_CPUARCH} == "amd64" -SRCS+= linux_dummy_x86.c +SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c .endif DPSRCS= assym.inc linux_genassym.c # XXX: for assym.inc SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h CLEANFILES= linux_assym.h linux_genassym.o linux_locore.o \ - genassym.o + genassym.o linux_vdso_gtod.o linux_vdso.so.o -OBJS= ${VDSO}.so +OBJS= linux_vdso.so linux_assym.h: linux_genassym.o sh ${SYSDIR}/kern/genassym.sh linux_genassym.o > ${.TARGET} -linux_locore.o: linux_locore.asm linux_assym.h - ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -shared -mcmodel=small \ - -pipe -I. -I${SYSDIR} ${WERROR} -Wall -fno-common -fPIC -nostdinc \ - -Wl,-T${SRCTOP}/sys/${MACHINE}/linux/${VDSO}.lds.s \ - -Wl,-soname=${VDSO}.so.1,-warn-common -nostdlib \ +.if ${MACHINE_CPUARCH} == "amd64" +VDSOFLAGS=-mregparm=0 -mcmodel=small -msoft-float +VDSODEPS=linux_vdso_gettc_x86.inc +.elif ${MACHINE_CPUARCH} == "aarch64" +# The Linux uses tiny memory model, but our ld does not know about +# some of relocation types which is generated by cc +VDSOFLAGS=-mgeneral-regs-only -mcmodel=small -ffixed-x18 +.endif + +linux_locore.o: linux_assym.h assym.inc + ${CC} -c -x assembler-with-cpp -DLOCORE \ + -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \ + -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} +linux_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS} + ${CC} -c -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \ + -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ + ${.IMPSRC} -o ${.TARGET} + +linux_vdso.so.o: linux_locore.o linux_vdso_gtod.o + ${LD} --shared --eh-frame-hdr -soname=linux-vdso.so.1 \ + --no-undefined --hash-style=both -warn-common -nostdlib \ + --strip-debug -s --build-id=sha1 -Bsymbolic \ + -T${SRCTOP}/sys/${MACHINE}/linux/linux_vdso.lds.s \ + -o ${.TARGET} ${.ALLSRC:M*.o} + .if ${MACHINE_CPUARCH} == "aarch64" OBJCOPY_TARGET=--output-target elf64-littleaarch64 --binary-architecture aarch64 .elif ${MACHINE_CPUARCH} == "amd64" OBJCOPY_TARGET=--output-target elf64-x86-64 --binary-architecture i386:x86-64 .else .error ${MACHINE_CPUARCH} not yet supported by linux64 .endif -${VDSO}.so: linux_locore.o + +linux_vdso.so: linux_vdso.so.o ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \ - linux_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET} + linux_vdso.so.o ${.TARGET} + ${STRIPBIN} -N _binary_linux_vdso_so_o_size ${.TARGET} linux_support.o: assym.inc linux_assym.h ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} linux_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense .endif .include diff --git a/sys/x86/linux/linux_vdso_gettc_x86.inc b/sys/x86/linux/linux_vdso_gettc_x86.inc new file mode 100644 index 000000000000..ade78a03486b --- /dev/null +++ b/sys/x86/linux/linux_vdso_gettc_x86.inc @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2016, 2017, 2019 The FreeBSD Foundation + * Copyright (c) 2021 Dmitry Chagin + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#if defined(__i386__) || defined(COMPAT_LINUX32) +#include +#include +#else +#include +#include +#endif + +static inline u_int +rdtsc_low(const struct vdso_timehands *th) +{ + u_int rv; + + __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" + : "=a" (rv) : "c" (th->th_x86_shift) : "edx"); + return (rv); +} + +static inline u_int +rdtscp_low(const struct vdso_timehands *th) +{ + u_int rv; + + __asm __volatile("rdtscp; movl %%edi,%%ecx; shrd %%cl, %%edx, %0" + : "=a" (rv) : "D" (th->th_x86_shift) : "ecx", "edx"); + return (rv); +} + +static u_int +rdtsc_low_mb_lfence(const struct vdso_timehands *th) +{ + lfence(); + return (rdtsc_low(th)); +} + +static u_int +rdtsc_low_mb_mfence(const struct vdso_timehands *th) +{ + mfence(); + return (rdtsc_low(th)); +} + +static u_int +rdtsc_low_mb_none(const struct vdso_timehands *th) +{ + return (rdtsc_low(th)); +} + +static u_int +rdtsc32_mb_lfence(void) +{ + lfence(); + return (rdtsc32()); +} + +static u_int +rdtsc32_mb_mfence(void) +{ + mfence(); + return (rdtsc32()); +} + +static u_int +rdtsc32_mb_none(void) +{ + return (rdtsc32()); +} + +static u_int +rdtscp32_(void) +{ + return (rdtscp32()); +} + +struct tsc_selector_tag { + u_int (*ts_rdtsc32)(void); + u_int (*ts_rdtsc_low)(const struct vdso_timehands *); +}; + +static const struct tsc_selector_tag tsc_selector[] = { + [0] = { /* Intel, LFENCE */ + .ts_rdtsc32 = rdtsc32_mb_lfence, + .ts_rdtsc_low = rdtsc_low_mb_lfence, + }, + [1] = { /* AMD, MFENCE */ + .ts_rdtsc32 = rdtsc32_mb_mfence, + .ts_rdtsc_low = rdtsc_low_mb_mfence, + }, + [2] = { /* No SSE2 */ + .ts_rdtsc32 = rdtsc32_mb_none, + .ts_rdtsc_low = rdtsc_low_mb_none, + }, + [3] = { /* RDTSCP */ + .ts_rdtsc32 = rdtscp32_, + .ts_rdtsc_low = rdtscp_low, + }, +}; + +static u_int +__vdso_gettc_rdtsc_low(const struct vdso_timehands *th) +{ + + return (tsc_selector[kern_tsc_selector].ts_rdtsc_low(th)); +} + +static u_int +__vdso_gettc_rdtsc32(void) +{ + + return (tsc_selector[kern_tsc_selector].ts_rdtsc32()); +} + +int +__vdso_gettc(const struct vdso_timehands *th, u_int *tc) +{ + + switch (th->th_algo) { + case VDSO_TH_ALGO_X86_TSC: + *tc = th->th_x86_shift > 0 ? __vdso_gettc_rdtsc_low(th) : + __vdso_gettc_rdtsc32(); + return (0); + case VDSO_TH_ALGO_X86_HPET: + /* TODO */ + default: + return (ENOSYS); + } +} diff --git a/sys/x86/linux/linux_vdso_tsc_selector_x86.c b/sys/x86/linux/linux_vdso_tsc_selector_x86.c new file mode 100644 index 000000000000..a3a65a6d337d --- /dev/null +++ b/sys/x86/linux/linux_vdso_tsc_selector_x86.c @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2016, 2017, 2019 The FreeBSD Foundation + * Copyright (c) 2021 Dmitry Chagin + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include + +int +linux_vdso_tsc_selector_idx() +{ + bool amd_cpu; + + if (cpu_feature == 0) + return (2); /* should not happen due to RDTSC */ + + amd_cpu = (cpu_vendor_id == CPU_VENDOR_AMD || + cpu_vendor_id == CPU_VENDOR_HYGON); + + if ((amd_feature & AMDID_RDTSCP) != 0) + return (3); + if ((cpu_feature & CPUID_SSE2) == 0) + return (2); + return (amd_cpu ? 1 : 0); +} diff --git a/sys/x86/linux/linux_x86.h b/sys/x86/linux/linux_x86.h new file mode 100644 index 000000000000..73736eb7eb84 --- /dev/null +++ b/sys/x86/linux/linux_x86.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _X86_INCLUDE_LINUX_LINUX_X86_H_ +#define _X86_INCLUDE_LINUX_LINUX_X86_H_ + +int linux_vdso_tsc_selector_idx(void); + +#endif /* _X86_INCLUDE_LINUX_LINUX_X86_H_ */