Index: sys/amd64/linux/linux_sysvec.c =================================================================== --- sys/amd64/linux/linux_sysvec.c +++ sys/amd64/linux/linux_sysvec.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ #include #include +#include #include #include #include @@ -85,11 +87,24 @@ MODULE_VERSION(linux64, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - \ + LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE_LA48 (LINUX_VDSOPAGE_LA48 - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK_LA48 LINUX_SHAREDPAGE_LA48 +#define LINUX_PS_STRINGS_LA48 (LINUX_USRSTACK_LA48 - \ + sizeof(struct ps_strings)) + static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; @@ -102,10 +117,13 @@ static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); +static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_vsyscall(struct thread *td); #define LINUX_T_UNKNOWN 255 @@ -149,6 +167,8 @@ LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode); LINUX_VDSO_SYM_CHAR(linux_platform); 
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); +LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); /* * If FreeBSD & Linux have a difference of opinion about what a trap @@ -261,8 +281,7 @@ M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); @@ -736,7 +755,7 @@ .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_rt_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, @@ -744,8 +763,8 @@ .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, - .sv_usrstack = USRSTACK_LA48, - .sv_psstrings = PS_STRINGS_LA48, + .sv_usrstack = LINUX_USRSTACK_LA48, + .sv_psstrings = LINUX_PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, @@ -753,57 +772,148 @@ .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | - SV_SIG_WAITNDQ, + SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE_LA48, + .sv_shared_page_base = LINUX_SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, - .sv_onexec = linux_on_exec, + .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; -static void -linux_vdso_install(void *param) +static int 
+linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { + int error; - amd64_lower_shared_page(&elf_linux_sysvec); + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); +} - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); +static void +linux_exec_sysvec_init(void *param) +{ + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + amd64_lower_shared_page(sv); + /* Fill timekeep_base */ + exec_sysvec_init_abi(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; + + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX_VDSOPAGE_LA48; + if (hw_lower_amd64_sharedpage != 0) + linux_vdso_base -= PAGE_SIZE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - 
bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Ehdr *ehdr; + const Elf_Shdr *shdr; + Elf64_Addr *where, val; + Elf_Size rtype, symidx; + const Elf_Rela *rela; + Elf_Addr addr, addend; + int relacnt; + int i, j; + + MPASS(offset != 0); + + relacnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + printf("Linux x86_64 vDSO: unexpected Rel section\n"); + break; + case SHT_RELA: + rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); + relacnt = shdr[i].sh_size / sizeof(*rela); + } + } + + for (j = 0; j < relacnt; j++, rela++) { + where = (Elf_Addr *)(mapping + rela->r_offset); + addend = rela->r_addend; + rtype = ELF_R_TYPE(rela->r_info); + symidx = ELF_R_SYM(rela->r_info); + + switch (rtype) { + case R_X86_64_NONE: /* none */ + break; + + case R_X86_64_RELATIVE: /* B + A */ + addr = (Elf_Addr)(offset + addend); + val = addr; + if (*where != val) + *where = val; + break; + case R_X86_64_IRELATIVE: + printf("Linux x86_64 vDSO: unexpected ifunc relocation, " + "symbol index %ld\n", symidx); + break; + default: + printf("Linux x86_64 vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", rtype, symidx); + } + } +} + static char 
GNULINUX_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; Index: sys/amd64/linux/linux_vdso.lds.s =================================================================== --- sys/amd64/linux/linux_vdso.lds.s +++ sys/amd64/linux/linux_vdso.lds.s @@ -54,16 +54,20 @@ { LINUX_2.6 { global: - time; __vdso_time; - gettimeofday; __vdso_gettimeofday; - getcpu; __vdso_getcpu; - clock_gettime; __vdso_clock_gettime; + __vdso_clock_getres; + local: *; + }; + + LINUX_0.0 { + global: linux_rt_sigcode; linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } Index: sys/amd64/linux/linux_vdso_gtod.c =================================================================== --- /dev/null +++ sys/amd64/linux/linux_vdso_gtod.c @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +/* for debug purpose */ +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_write), "D"(fd), "S"(buf), "d"(size) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime), "D"(clock_id), "S"(ts) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_gettimeofday), "D"(tv), "S"(tz) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_getres), "D"(clock_id), "S"(ts) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_getcpu_fallback(uint32_t *cpu, 
uint32_t *node, void *cache) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_getcpu), "D"(cpu), "S"(node), "d"(cache) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "syscall" + : "=a"(res) + : "a"(LINUX_SYS_linux_time), "D"(tm) + : "cc", "rcx", "r11", "memory" + ); + return (res); +} + +#include Index: sys/amd64/linux32/linux32_locore.asm =================================================================== --- sys/amd64/linux32/linux32_locore.asm +++ sys/amd64/linux32/linux32_locore.asm @@ -18,7 +18,7 @@ * To avoid excess stack frame the signal trampoline code emulates * the 'call' instruction. */ -ENTRY(linux32_sigcode) +ENTRY(__kernel_sigreturn) movl %esp, %ebx /* preserve sigframe */ call .getip0 .getip0: @@ -33,7 +33,7 @@ .endsigcode: 0: jmp 0b -ENTRY(linux32_rt_sigcode) +ENTRY(__kernel_rt_sigreturn) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */ movl %esp, %edi @@ -49,7 +49,7 @@ .endrtsigcode: 0: jmp 0b -ENTRY(linux32_vsyscall) +ENTRY(__kernel_vsyscall) .startvsyscall: int $0x80 ret Index: sys/amd64/linux32/linux32_sysvec.c =================================================================== --- sys/amd64/linux32/linux32_sysvec.c +++ sys/amd64/linux32/linux32_sysvec.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,7 @@ #include #include +#include #include #include #include @@ -91,14 +93,21 @@ MODULE_VERSION(linux, 1); #define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE) -#define LINUX32_SHAREDPAGE (LINUX32_MAXUSER - PAGE_SIZE) +#define LINUX32_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX32_VDSOPAGE (LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE) +#define LINUX32_SHAREDPAGE (LINUX32_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ #define LINUX32_USRSTACK LINUX32_SHAREDPAGE static int 
linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux32_locore_o_start; -extern char _binary_linux32_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux32_vdso_so_o_start; +extern char _binary_linux32_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; @@ -111,10 +120,14 @@ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static int linux_on_exec_vmspace(struct proc *p, + struct image_params *imgp); +static void linux_exec_sysvec_init(void *param); static void linux32_fixlimit(struct rlimit *rl, int which); static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux32_set_syscall_retval(struct thread *td, int error); #define LINUX_T_UNKNOWN 255 @@ -165,9 +178,11 @@ #define LINUX32_PS_STRINGS (LINUX32_USRSTACK - \ sizeof(struct linux32_ps_strings)) -LINUX_VDSO_SYM_INTPTR(linux32_sigcode); -LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode); -LINUX_VDSO_SYM_INTPTR(linux32_vsyscall); +LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall); +LINUX_VDSO_SYM_INTPTR(__kernel_sigreturn); +LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn); +LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); +LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); LINUX_VDSO_SYM_CHAR(linux_platform); /* @@ -204,9 +219,8 @@ M_WAITOK | M_ZERO); issetugid = imgp->proc->p_flag & P_SUGID ? 
1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux32_vsyscall); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); @@ -352,7 +366,7 @@ /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); - regs->tf_rip = linux32_rt_sigcode; + regs->tf_rip = __kernel_rt_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; @@ -458,7 +472,7 @@ /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); - regs->tf_rip = linux32_sigcode; + regs->tf_rip = __kernel_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; @@ -908,7 +922,7 @@ .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux32_locore_o_start, + .sv_sigcode = &_binary_linux32_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, @@ -925,7 +939,7 @@ .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP | - SV_SIG_DISCIGN | SV_SIG_WAITNDQ, + SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux32_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, @@ -934,46 +948,136 @@ .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, - .sv_onexec = linux_on_exec, + .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; +static int +linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) +{ + int error; + + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + 
LINUX32_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); +} + static void -linux_vdso_install(void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init_abi(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - linux_szsigcode = (&_binary_linux32_locore_o_end - - &_binary_linux32_locore_o_start); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux32_vdso_so_o_start; + char *vdso_end = &_binary_linux32_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX32_VDSOPAGE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, 
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Shdr *shdr; + const Elf_Rel *rel; + const Elf_Ehdr *ehdr; + Elf32_Addr *where; + Elf_Size rtype, symidx; + Elf32_Addr addr, addend; + int i, relcnt; + + MPASS(offset != 0); + + relcnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); + relcnt = shdr[i].sh_size / sizeof(*rel); + break; + case SHT_RELA: + printf("Linux i386 vDSO: unexpected Rela section\n"); + break; + } + } + + for (i = 0; i < relcnt; i++, rel++) { + where = (Elf32_Addr *)(mapping + rel->r_offset); + addend = *where; + rtype = ELF_R_TYPE(rel->r_info); + symidx = ELF_R_SYM(rel->r_info); + + switch (rtype) { + case R_386_NONE: /* none */ + break; + + case R_386_RELATIVE: /* B + A */ + addr = (Elf32_Addr)PTROUT(offset + addend); + if (*where != addr) + *where = addr; + break; + + case R_386_IRELATIVE: + printf("Linux i386 vDSO: unexpected ifunc relocation, " + "symbol index %ld\n", (intmax_t)symidx); + break; + default: + printf("Linux i386 vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; Index: sys/amd64/linux32/linux32_vdso.lds.s =================================================================== --- sys/amd64/linux32/linux32_vdso.lds.s +++ sys/amd64/linux32/linux32_vdso.lds.s @@ -51,16 +51,30 @@ 
eh_frame_hdr PT_GNU_EH_FRAME; } -ENTRY(linux32_vsyscall); - VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + __vdso_clock_getres; + __vdso_clock_gettime64; + }; + LINUX_2.5 { global: - linux32_vsyscall; - linux32_sigcode; - linux32_rt_sigcode; + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; + + LINUX_0.0 { + global: linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } Index: sys/amd64/linux32/linux32_vdso_gtod.c =================================================================== --- /dev/null +++ sys/amd64/linux32/linux32_vdso_gtod.c @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_write), "b"(fd), "c"(buf), "d"(size) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_gettimeofday), "b"(tv), "c"(tz) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; 
+ + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_clock_getres), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX32_SYS_linux_time), "b"(tm) + : "cc", "memory" + ); + return (res); +} + +#include Index: sys/arm64/linux/linux_sysvec.c =================================================================== --- sys/arm64/linux/linux_sysvec.c +++ sys/arm64/linux/linux_sysvec.c @@ -41,10 +41,17 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -65,11 +72,24 @@ MODULE_VERSION(linux64elf, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - \ + LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK LINUX_SHAREDPAGE +#define LINUX_PS_STRINGS (LINUX_USRSTACK - \ + sizeof(struct ps_strings)) + static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; @@ -82,10 +102,13 @@ static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, 
struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); +static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_vsyscall(struct thread *td); /* DTrace init */ @@ -102,6 +125,10 @@ LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_install, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_deinstall, todo); +LINUX_VDSO_SYM_CHAR(linux_platform); +LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); +LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn); + /* LINUXTODO: do we have traps to translate? */ static int linux_translate_traps(int signal, int trap_code) @@ -111,8 +138,6 @@ return (signal); } -LINUX_VDSO_SYM_CHAR(linux_platform); - static int linux_fetch_syscall_args(struct thread *td) { @@ -168,8 +193,7 @@ M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, *imgp->sysent->sv_hwcap); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); @@ -412,7 +436,7 @@ .sv_transtrap = linux_translate_traps, .sv_fixup = linux_elf_fixup, .sv_sendsig = linux_rt_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, @@ -420,8 +444,8 @@ .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, /* XXX */ + .sv_usrstack = LINUX_USRSTACK, + .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, @@ -429,58 +453,135 @@ .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | - SV_SIG_WAITNDQ, + SV_SIG_WAITNDQ | 
SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, .sv_hwcap = &elf_hwcap, .sv_hwcap2 = &elf_hwcap2, - .sv_onexec = linux_on_exec, + .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; +static int +linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) +{ + int error; + + error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, + LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); +} + static void -linux_vdso_install(const void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init_abi(sv); - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("invalid Linux VDSO size\n"); +static void +linux_vdso_install(const void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + linux_vdso_base = LINUX_VDSOPAGE; - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + 
__elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - memcpy(linux_shared_page_mapping, elf_linux_sysvec.sv_sigcode, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); + + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { - LIN_SDT_PROBE0(sysvec, linux_vdso_deinstall, todo); - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + Elf_Size rtype, symidx; + const Elf_Rela *rela; + const Elf_Shdr *shdr; + const Elf_Ehdr *ehdr; + Elf_Addr *where; + Elf_Addr addr, addend; + int i, relacnt; + + MPASS(offset != 0); + + relacnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + printf("Linux Aarch64 vDSO: unexpected Rel section\n"); + break; + case SHT_RELA: + rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); + relacnt = shdr[i].sh_size / sizeof(*rela); + } + } + + for (i = 0; i < relacnt; i++, rela++) { + where = (Elf_Addr *)(mapping + rela->r_offset); + addend = rela->r_addend; + rtype = ELF_R_TYPE(rela->r_info); + symidx = ELF_R_SYM(rela->r_info); + + switch (rtype) { + case R_AARCH64_NONE: /* none */ + break; + + case R_AARCH64_RELATIVE: /* B + A */ + addr = (Elf_Addr)(offset + addend); + if (*where != addr) + *where = addr; + break; + default: + printf("Linux Aarch64
vDSO: unexpected relocation type %ld, " + "symbol index %ld\n", rtype, symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNU_ABI_LINUX = 0; Index: sys/arm64/linux/linux_vdso.lds.s =================================================================== --- sys/arm64/linux/linux_vdso.lds.s +++ sys/arm64/linux/linux_vdso.lds.s @@ -1,6 +1,6 @@ /* - * Stub arm64 vdso linker script. - * LINUXTODO: update along with VDSO implementation + * Linker script for 64-bit vDSO. + * Copied from Linux kernel arch/x86/vdso/vdso-layout.lds.S * * $FreeBSD$ */ @@ -8,15 +8,66 @@ SECTIONS { . = . + SIZEOF_HEADERS; - .text : { *(.text*) } - .rodata : { *(.rodata*) } - .hash : { *(.hash) } + + .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } - .data : { *(.data*) } - .dynamic : { *(.dynamic) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + . = ALIGN(0x100); + .text : { *(.text*) } :text =0x90909090 +} + +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO.
+ */ +VERSION +{ + LINUX_2.6.39 { + global: + __kernel_rt_sigreturn; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + local: *; + }; + + LINUX_0.0 { + global: + linux_platform; + kern_timekeep_base; + local: *; + }; } Index: sys/arm64/linux/linux_vdso_gtod.c =================================================================== --- /dev/null +++ sys/arm64/linux/linux_vdso_gtod.c @@ -0,0 +1,153 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixup this at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +static int +write(int lfd, const void *lbuf, size_t lsize) +{ + register long svc asm("x8") = LINUX_SYS_write; + register int fd asm("x0") = lfd; + register const char *buf asm("x1") = lbuf; + register long size asm("x2") = lsize; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (fd), "r" (buf), "r" (size), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *lts) +{ + register long svc asm("x8") = LINUX_SYS_linux_clock_gettime; + register clockid_t clockid asm("x0") = clock_id; + register struct l_timespec *ts asm("x1") = lts; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (clockid), "r" (ts), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *ltv, struct timezone *ltz) +{ + register long svc asm("x8") = LINUX_SYS_gettimeofday; + register l_timeval *tv asm("x0") = ltv; + register struct timezone *tz asm("x1") = ltz; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (tv), "r" (tz), "r" (svc) + : "memory"); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *lts) +{ + register long svc asm("x8") = LINUX_SYS_linux_clock_getres; + register clockid_t clockid asm("x0") = clock_id; + register struct l_timespec *ts asm("x1") = lts; + register long res asm ("x0"); + + asm volatile( + " svc #0\n" + : "=r" (res) + : "r" (clockid), "r" (ts), "r" (svc) + : "memory"); + return (res); +} + +/* + * copied from lib/libc/aarch64/sys/__vdso_gettc.c + */ +
+static inline uint64_t +cp15_cntvct_get(void) +{ + uint64_t reg; + + __asm __volatile("mrs %0, cntvct_el0" : "=r" (reg)); + return (reg); +} + +static inline uint64_t +cp15_cntpct_get(void) +{ + uint64_t reg; + + __asm __volatile("mrs %0, cntpct_el0" : "=r" (reg)); + return (reg); +} + +int +__vdso_gettc(const struct vdso_timehands *th, u_int *tc) +{ + + if (th->th_algo != VDSO_TH_ALGO_ARM_GENTIM) + return (ENOSYS); + __asm __volatile("isb" : : : "memory"); + *tc = th->th_physical == 0 ? cp15_cntvct_get() : cp15_cntpct_get(); + return (0); +} + +#include Index: sys/compat/linux/linux_vdso.h =================================================================== --- sys/compat/linux/linux_vdso.h +++ sys/compat/linux/linux_vdso.h @@ -38,12 +38,14 @@ char symname[]; }; -vm_object_t __elfN(linux_shared_page_init)(char **); -void __elfN(linux_shared_page_fini)(vm_object_t, void *); -void __elfN(linux_vdso_fixup)(struct sysentvec *); -void __elfN(linux_vdso_reloc)(struct sysentvec *); +vm_object_t __elfN(linux_shared_page_init)(char **, vm_size_t); +void __elfN(linux_shared_page_fini)(vm_object_t, void *, vm_size_t); +void __elfN(linux_vdso_fixup)(char *, vm_offset_t); void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *); +int linux_map_vdso(struct proc *, vm_object_t, vm_offset_t, + vm_offset_t, struct image_params *); + #define LINUX_VDSO_SYM_INTPTR(name) \ uintptr_t name; \ LINUX_VDSO_SYM_DEFINE(name) Index: sys/compat/linux/linux_vdso.c =================================================================== --- sys/compat/linux/linux_vdso.c +++ sys/compat/linux/linux_vdso.c @@ -38,17 +38,16 @@ #include #include #include +#include #include -#include +#include #include #include #include -#include #include #include #include -#include #include #include #include @@ -59,12 +58,6 @@ SLIST_HEAD(, linux_vdso_sym) __elfN(linux_vdso_syms) = SLIST_HEAD_INITIALIZER(__elfN(linux_vdso_syms)); -static int __elfN(symtabindex); -static int __elfN(symstrindex); - -static void 
-__elfN(linux_vdso_lookup)(Elf_Ehdr *, struct linux_vdso_sym *); - void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *s) { @@ -73,176 +66,119 @@ } vm_object_t -__elfN(linux_shared_page_init)(char **mapping) +__elfN(linux_shared_page_init)(char **mapping, vm_size_t size) { vm_page_t m; vm_object_t obj; vm_offset_t addr; + size_t n, pages; + + pages = size / PAGE_SIZE; - obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, + addr = kva_alloc(size); + obj = vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(obj); - m = vm_page_grab(obj, 0, VM_ALLOC_ZERO); + for (n = 0; n < pages; n++) { + m = vm_page_grab(obj, n, + VM_ALLOC_ZERO); + vm_page_valid(m); + vm_page_xunbusy(m); + pmap_qenter(addr + n * PAGE_SIZE, &m, 1); + } VM_OBJECT_WUNLOCK(obj); - vm_page_valid(m); - vm_page_xunbusy(m); - addr = kva_alloc(PAGE_SIZE); - pmap_qenter(addr, &m, 1); *mapping = (char *)addr; return (obj); } void -__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping) +__elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping, + vm_size_t size) { vm_offset_t va; va = (vm_offset_t)mapping; - pmap_qremove(va, 1); - kva_free(va, PAGE_SIZE); + pmap_qremove(va, size / PAGE_SIZE); + kva_free(va, size); vm_object_deallocate(obj); } void -__elfN(linux_vdso_fixup)(struct sysentvec *sv) +__elfN(linux_vdso_fixup)(char *base, vm_offset_t offset) { + struct linux_vdso_sym *lsym; + const Elf_Shdr *shdr; Elf_Ehdr *ehdr; - Elf_Shdr *shdr; - int i; + Elf_Sym *dsym, *sym; + char *strtab, *symname; + int i, symcnt; - ehdr = (Elf_Ehdr *) sv->sv_sigcode; + ehdr = (Elf_Ehdr *)base; - if (!IS_ELF(*ehdr) || - ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || - ehdr->e_ident[EI_DATA] != ELF_TARG_DATA || - ehdr->e_ident[EI_VERSION] != EV_CURRENT || - ehdr->e_shoff == 0 || - ehdr->e_shentsize != sizeof(Elf_Shdr)) - panic("Linux invalid vdso header.\n"); + MPASS(IS_ELF(*ehdr)); + MPASS(ehdr->e_ident[EI_CLASS] == ELF_TARG_CLASS); + MPASS(ehdr->e_ident[EI_DATA] == ELF_TARG_DATA); + 
MPASS(ehdr->e_ident[EI_VERSION] == EV_CURRENT); + MPASS(ehdr->e_shentsize == sizeof(Elf_Shdr)); + MPASS(ehdr->e_shoff != 0); + MPASS(ehdr->e_type == ET_DYN); - if (ehdr->e_type != ET_DYN) - panic("Linux invalid vdso header.\n"); + shdr = (const Elf_Shdr *)(base + ehdr->e_shoff); - shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff); - - __elfN(symtabindex) = -1; - __elfN(symstrindex) = -1; + dsym = NULL; for (i = 0; i < ehdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; if (shdr[i].sh_type == SHT_DYNSYM) { - __elfN(symtabindex) = i; - __elfN(symstrindex) = shdr[i].sh_link; + dsym = (Elf_Sym *)(base + shdr[i].sh_offset); + strtab = base + shdr[shdr[i].sh_link].sh_offset; + symcnt = shdr[i].sh_size / sizeof(*dsym); + break; } } - - if (__elfN(symtabindex) == -1 || __elfN(symstrindex) == -1) - panic("Linux invalid vdso header.\n"); + MPASS(dsym != NULL); ehdr->e_ident[EI_OSABI] = ELFOSABI_LINUX; -} -void -__elfN(linux_vdso_reloc)(struct sysentvec *sv) -{ - struct linux_vdso_sym *lsym; - Elf_Ehdr *ehdr; - Elf_Phdr *phdr; - Elf_Shdr *shdr; - Elf_Dyn *dyn; - Elf_Sym *sym; - int i, j, symcnt; - - ehdr = (Elf_Ehdr *) sv->sv_sigcode; - - /* Adjust our so relative to the sigcode_base */ - if (sv->sv_shared_page_base != 0) { - ehdr->e_entry += sv->sv_shared_page_base; - phdr = (Elf_Phdr *)((caddr_t)ehdr + ehdr->e_phoff); - - /* phdrs */ - for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += sv->sv_shared_page_base; - if (phdr[i].p_type != PT_DYNAMIC) - continue; - dyn = (Elf_Dyn *)((caddr_t)ehdr + phdr[i].p_offset); - for(; dyn->d_tag != DT_NULL; dyn++) { - switch (dyn->d_tag) { - case DT_PLTGOT: - case DT_HASH: - case DT_STRTAB: - case DT_SYMTAB: - case DT_RELA: - case DT_INIT: - case DT_FINI: - case DT_REL: - case DT_DEBUG: - case DT_JMPREL: - case DT_VERSYM: - case DT_VERDEF: - case DT_VERNEED: - case DT_ADDRRNGLO ... DT_ADDRRNGHI: - dyn->d_un.d_ptr += sv->sv_shared_page_base; - break; - case DT_ENCODING ... DT_LOOS-1: - case DT_LOOS ... 
DT_HIOS: - if (dyn->d_tag >= DT_ENCODING && - (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += sv->sv_shared_page_base; - break; - default: - break; - } - } - } + /* + * VDSO is readonly mapped to the process VA and + * can't be relocated by rtld. + */ + SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) { + for (i = 0, sym = dsym; i < symcnt; i++, sym++) { + symname = strtab + sym->st_name; + if (strncmp(lsym->symname, symname, lsym->size) == 0) { + sym->st_value += offset; + *lsym->ptr = sym->st_value; + break; - /* sections */ - shdr = (Elf_Shdr *)((caddr_t)ehdr + ehdr->e_shoff); - for(i = 0; i < ehdr->e_shnum; i++) { - if (!(shdr[i].sh_flags & SHF_ALLOC)) - continue; - shdr[i].sh_addr += sv->sv_shared_page_base; - if (shdr[i].sh_type != SHT_SYMTAB && - shdr[i].sh_type != SHT_DYNSYM) - continue; - - sym = (Elf_Sym *)((caddr_t)ehdr + shdr[i].sh_offset); - symcnt = shdr[i].sh_size / sizeof(*sym); - - for(j = 0; j < symcnt; j++, sym++) { - if (sym->st_shndx == SHN_UNDEF || - sym->st_shndx == SHN_ABS) - continue; - sym->st_value += sv->sv_shared_page_base; } } } - - SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) - __elfN(linux_vdso_lookup)(ehdr, lsym); } -static void -__elfN(linux_vdso_lookup)(Elf_Ehdr *ehdr, struct linux_vdso_sym *vsym) +int +linux_map_vdso(struct proc *p, vm_object_t obj, vm_offset_t base, + vm_offset_t size, struct image_params *imgp) { - vm_offset_t strtab, symname; - uint32_t symcnt; - Elf_Shdr *shdr; - int i; - - shdr = (Elf_Shdr *) ((caddr_t)ehdr + ehdr->e_shoff); - - strtab = (vm_offset_t)((caddr_t)ehdr + - shdr[__elfN(symstrindex)].sh_offset); - Elf_Sym *sym = (Elf_Sym *)((caddr_t)ehdr + - shdr[__elfN(symtabindex)].sh_offset); - symcnt = shdr[__elfN(symtabindex)].sh_size / sizeof(*sym); - - for (i = 0; i < symcnt; ++i, ++sym) { - symname = strtab + sym->st_name; - if (strncmp(vsym->symname, (char *)symname, vsym->size) == 0) { - *vsym->ptr = (uintptr_t)sym->st_value; - break; - } + struct vmspace *vmspace; + vm_map_t map; + int error; + + 
MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX); + MPASS(obj != NULL); + + vmspace = p->p_vmspace; + map = &vmspace->vm_map; + + vm_object_reference(obj); + error = vm_map_fixed(map, obj, 0, base, size, + VM_PROT_READ | VM_PROT_EXECUTE, + VM_PROT_READ | VM_PROT_EXECUTE, + MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + return (vm_mmap_to_errno(error)); } + return (0); } Index: sys/compat/linux/linux_vdso_gtod.inc =================================================================== --- /dev/null +++ sys/compat/linux/linux_vdso_gtod.inc @@ -0,0 +1,338 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +static int +__vdso_native_to_linux_timespec(struct l_timespec *lts, + struct timespec *nts) +{ + +#ifdef COMPAT_LINUX32 + if (nts->tv_sec > INT_MAX || nts->tv_sec < INT_MIN) + return (LINUX_EOVERFLOW); +#endif + lts->tv_sec = nts->tv_sec; + lts->tv_nsec = nts->tv_nsec; + return (0); +} + +static int +__vdso_native_to_linux_timeval(l_timeval *ltv, + struct timeval *ntv) +{ + +#ifdef COMPAT_LINUX32 + if (ntv->tv_sec > INT_MAX || ntv->tv_sec < INT_MIN) + return (LINUX_EOVERFLOW); +#endif + ltv->tv_sec = ntv->tv_sec; + ltv->tv_usec = ntv->tv_usec; + return (0); +} + + +#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) +static int +__vdso_native_to_linux_timespec64(struct l_timespec64 *lts, + struct timespec *nts) +{ + + lts->tv_sec = nts->tv_sec; + lts->tv_nsec = nts->tv_nsec; + return (0); +} +#endif + +static int +__vdso_linux_to_native_clockid(clockid_t *n, clockid_t l) +{ + + switch (l) { + case LINUX_CLOCK_REALTIME: + *n = CLOCK_REALTIME; + break; + case LINUX_CLOCK_MONOTONIC: + *n = CLOCK_MONOTONIC; + break; + case LINUX_CLOCK_REALTIME_COARSE: + *n = CLOCK_REALTIME_FAST; + break; + case LINUX_CLOCK_MONOTONIC_COARSE: + case LINUX_CLOCK_MONOTONIC_RAW: + *n = CLOCK_MONOTONIC_FAST; + break; + case LINUX_CLOCK_BOOTTIME: + *n = CLOCK_UPTIME; + break; + default: + return (LINUX_EINVAL); + } + return (0); +} + +/* + * The code below adapted from + * lib/libc/sys/__vdso_gettimeofday.c + */ + +static inline void 
+__vdso_gettimekeep(struct vdso_timekeep **tk) +{ + + *tk = (struct vdso_timekeep *)kern_timekeep_base; +} + +static int +tc_delta(const struct vdso_timehands *th, u_int *delta) +{ + int error; + u_int tc; + + error = __vdso_gettc(th, &tc); + if (error == 0) + *delta = (tc - th->th_offset_count) & th->th_counter_mask; + return (error); +} + +/* + * Calculate the absolute or boot-relative time from the + * machine-specific fast timecounter and the published timehands + * structure read from the shared page. + * + * The lockless reading scheme is similar to the one used to read the + * in-kernel timehands, see sys/kern/kern_tc.c:binuptime(). This code + * is based on the kernel implementation. + */ +static int +freebsd_binuptime(struct bintime *bt, struct vdso_timekeep *tk, bool abs) +{ + struct vdso_timehands *th; + uint32_t curr, gen; + uint64_t scale, x; + u_int delta, scale_bits; + int error; + + do { + if (!tk->tk_enabled) + return (ENOSYS); + + curr = atomic_load_acq_32(&tk->tk_current); + th = &tk->tk_th[curr]; + gen = atomic_load_acq_32(&th->th_gen); + *bt = th->th_offset; + error = tc_delta(th, &delta); + if (error == EAGAIN) + continue; + if (error != 0) + return (error); + scale = th->th_scale; +#ifdef _LP64 + scale_bits = ffsl(scale); +#else + scale_bits = ffsll(scale); +#endif + if (__predict_false(scale_bits + fls(delta) > 63)) { + x = (scale >> 32) * delta; + scale &= 0xffffffff; + bt->sec += x >> 32; + bintime_addx(bt, x << 32); + } + bintime_addx(bt, scale * delta); + if (abs) + bintime_add(bt, &th->th_boottime); + + /* + * Ensure that the load of th_offset is completed + * before the load of th_gen. 
+ */ + atomic_thread_fence_acq(); + } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen); + return (0); +} + +static int +freebsd_getnanouptime(struct bintime *bt, struct vdso_timekeep *tk) +{ + struct vdso_timehands *th; + uint32_t curr, gen; + + do { + if (!tk->tk_enabled) + return (ENOSYS); + + curr = atomic_load_acq_32(&tk->tk_current); + th = &tk->tk_th[curr]; + gen = atomic_load_acq_32(&th->th_gen); + *bt = th->th_offset; + + /* + * Ensure that the load of th_offset is completed + * before the load of th_gen. + */ + atomic_thread_fence_acq(); + } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen); + return (0); +} + +static int +freebsd_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + struct vdso_timekeep *tk; + struct bintime bt; + int error; + + if (tz != NULL) + return (ENOSYS); + __vdso_gettimekeep(&tk); + if (tk == NULL) + return (ENOSYS); + if (tk->tk_ver != VDSO_TK_VER_CURR) + return (ENOSYS); + error = freebsd_binuptime(&bt, tk, true); + if (error == 0) + bintime2timeval(&bt, tv); + return (error); +} + +static int +freebsd_clock_gettime(clockid_t clock_id, struct timespec *ts) +{ + struct vdso_timekeep *tk; + struct bintime bt; + int error; + + __vdso_gettimekeep(&tk); + if (tk == NULL) + return (ENOSYS); + if (tk->tk_ver != VDSO_TK_VER_CURR) + return (ENOSYS); + switch (clock_id) { + case CLOCK_REALTIME: + case CLOCK_REALTIME_PRECISE: + case CLOCK_REALTIME_FAST: + error = freebsd_binuptime(&bt, tk, true); + break; + case CLOCK_MONOTONIC: + case CLOCK_MONOTONIC_PRECISE: + case CLOCK_UPTIME: + case CLOCK_UPTIME_PRECISE: + error = freebsd_binuptime(&bt, tk, false); + break; + case CLOCK_MONOTONIC_FAST: + case CLOCK_UPTIME_FAST: + error = freebsd_getnanouptime(&bt, tk); + break; + default: + error = ENOSYS; + break; + } + if (error == 0) + bintime2timespec(&bt, ts); + return (error); +} + +/* + * Linux vDSO interfaces + * + */ +int +__vdso_clock_gettime(clockid_t clock_id, struct l_timespec *lts) +{ + struct timespec 
ts; + clockid_t which; + int error; + + error = __vdso_linux_to_native_clockid(&which, clock_id); + if (error != 0) + return (__vdso_clock_gettime_fallback(clock_id, lts)); + error = freebsd_clock_gettime(which, &ts); + if (error == 0) + return (-__vdso_native_to_linux_timespec(lts, &ts)); + else + return (__vdso_clock_gettime_fallback(clock_id, lts)); +} + +int +__vdso_gettimeofday(l_timeval *ltv, struct timezone *tz) +{ + struct timeval tv; + int error; + + error = freebsd_gettimeofday(&tv, tz); + if (error != 0) + return (__vdso_gettimeofday_fallback(ltv, tz)); + return (-__vdso_native_to_linux_timeval(ltv, &tv)); +} + +int +__vdso_clock_getres(clockid_t clock_id, struct l_timespec *lts) +{ + + return (__vdso_clock_getres_fallback(clock_id, lts)); +} + +#if defined(__i386__) || defined(COMPAT_LINUX32) +int +__vdso_clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts) +{ + struct timespec ts; + clockid_t which; + int error; + + error = __vdso_linux_to_native_clockid(&which, clock_id); + if (error != 0) + return (__vdso_clock_gettime64_fallback(clock_id, lts)); + error = freebsd_clock_gettime(which, &ts); + if (error == 0) + return(-__vdso_native_to_linux_timespec64(lts, &ts)); + else + return(__vdso_clock_gettime64_fallback(clock_id, lts)); +} + +int clock_gettime64(clockid_t clock_id, struct l_timespec64 *lts) + __attribute__((weak, alias("__vdso_clock_gettime64"))); + +#endif + +#if defined(__amd64__) && !defined(COMPAT_LINUX32) +int +__vdso_getcpu(uint32_t *cpu, uint32_t *node, void *cache) +{ + + return (__vdso_getcpu_fallback(cpu, node, cache)); +} +#endif + +#if defined(__i386__) || defined(__amd64__) +int +__vdso_time(long *tm) +{ + + return (__vdso_time_fallback(tm)); +} +#endif Index: sys/i386/linux/linux.h =================================================================== --- sys/i386/linux/linux.h +++ sys/i386/linux/linux.h @@ -39,9 +39,6 @@ #define LINUX_DTRACE linuxulator -#define LINUX_SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) -#define 
LINUX_USRSTACK LINUX_SHAREDPAGE - /* * Provide a separate set of types for the Linux types. */ Index: sys/i386/linux/linux_locore.asm =================================================================== --- sys/i386/linux/linux_locore.asm +++ sys/i386/linux/linux_locore.asm @@ -19,7 +19,7 @@ * To avoid excess stack frame the signal trampoline code emulates * the 'call' instruction. */ -ENTRY(linux_sigcode) +ENTRY(__kernel_sigreturn) movl %esp, %ebx /* preserve sigframe */ call .getip0 .getip0: @@ -34,7 +34,7 @@ .endsigcode: 0: jmp 0b -ENTRY(linux_rt_sigcode) +ENTRY(__kernel_rt_sigreturn) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */ movl %esp, %edi @@ -50,7 +50,7 @@ .endrtsigcode: 0: jmp 0b -ENTRY(linux_vsyscall) +ENTRY(__kernel_vsyscall) .startvsyscall: int $0x80 ret Index: sys/i386/linux/linux_sysvec.c =================================================================== --- sys/i386/linux/linux_sysvec.c +++ sys/i386/linux/linux_sysvec.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ #include #include +#include #include #include #include @@ -75,13 +77,22 @@ MODULE_VERSION(linux, 1); +#define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 +#define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - LINUX_VDSOPAGE_SIZE) +#define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) + /* + * PAGE_SIZE - the size + * of the native SHAREDPAGE + */ +#define LINUX_USRSTACK LINUX_SHAREDPAGE #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; -static vm_object_t linux_shared_page_obj; -static char *linux_shared_page_mapping; -extern char _binary_linux_locore_o_start; -extern char _binary_linux_locore_o_end; +static vm_object_t linux_vdso_obj; +static char *linux_vdso_mapping; +extern char _binary_linux_vdso_so_o_start; +extern char _binary_linux_vdso_so_o_end; +static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; @@ 
-94,11 +105,15 @@ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); +static void linux_exec_sysvec_init(void *param); +static int linux_on_exec_vmspace(struct proc *p, + struct image_params *imgp); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); +static void linux_vdso_reloc(char *mapping, Elf_Addr offset); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { @@ -140,9 +155,11 @@ LINUX_T_UNKNOWN) LINUX_VDSO_SYM_CHAR(linux_platform); -LINUX_VDSO_SYM_INTPTR(linux_sigcode); -LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode); -LINUX_VDSO_SYM_INTPTR(linux_vsyscall); +LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall); +LINUX_VDSO_SYM_INTPTR(__kernel_sigreturn); +LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn); +LINUX_VDSO_SYM_INTPTR(kern_timekeep_base); +LINUX_VDSO_SYM_INTPTR(kern_tsc_selector); /* * If FreeBSD & Linux have a difference of opinion about what a trap @@ -200,9 +217,8 @@ argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, - imgp->proc->p_sysent->sv_shared_page_base); - AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); + AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* @@ -466,7 +482,7 @@ /* Build context to run handler in. */ regs->tf_esp = (int)fp; - regs->tf_eip = linux_rt_sigcode; + regs->tf_eip = __kernel_rt_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; @@ -568,7 +584,7 @@ /* Build context to run handler in. 
*/ regs->tf_esp = (int)fp; - regs->tf_eip = linux_sigcode; + regs->tf_eip = __kernel_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; @@ -814,7 +830,7 @@ .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, @@ -837,7 +853,7 @@ .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, - .sv_onexec = linux_on_exec, + .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, @@ -850,7 +866,7 @@ .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, - .sv_sigcode = &_binary_linux_locore_o_start, + .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, @@ -867,7 +883,7 @@ .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP | - SV_SIG_DISCIGN | SV_SIG_WAITNDQ, + SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, @@ -876,46 +892,137 @@ .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, - .sv_onexec = linux_on_exec, + .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; +static int +linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) +{ + int error = 0; + + if (SV_PROC_FLAG(p, SV_SHP) != 0) + error = linux_map_vdso(p, linux_vdso_obj, + linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); + if (error == 0) + linux_on_exec(p, imgp); + return (error); +} + static 
void -linux_vdso_install(void *param) +linux_exec_sysvec_init(void *param) { + l_uintptr_t *ktimekeep_base, *ktsc_selector; + struct sysentvec *sv; + ptrdiff_t tkoff; + + sv = param; + /* Fill timekeep_base */ + exec_sysvec_init_abi(sv); + + tkoff = kern_timekeep_base - linux_vdso_base; + ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktimekeep_base = sv->sv_timekeep_base; + + tkoff = kern_tsc_selector - linux_vdso_base; + ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); + *ktsc_selector = linux_vdso_tsc_selector_idx(); + if (bootverbose) + printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); +} +SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY, + linux_exec_sysvec_init, &elf_linux_sysvec); - linux_szsigcode = (&_binary_linux_locore_o_end - - &_binary_linux_locore_o_start); +static void +linux_vdso_install(void *param) +{ + char *vdso_start = &_binary_linux_vdso_so_o_start; + char *vdso_end = &_binary_linux_vdso_so_o_end; - if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) - panic("Linux invalid vdso size\n"); + linux_szsigcode = vdso_end - vdso_start; + MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); - __elfN(linux_vdso_fixup)(&elf_linux_sysvec); + linux_vdso_base = LINUX_VDSOPAGE; - linux_shared_page_obj = __elfN(linux_shared_page_init) - (&linux_shared_page_mapping); + __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); - __elfN(linux_vdso_reloc)(&elf_linux_sysvec); + linux_vdso_obj = __elfN(linux_shared_page_init) + (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); + bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); - bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, - linux_szsigcode); - elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; + linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } -SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, +SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(void 
*param) { - __elfN(linux_shared_page_fini)(linux_shared_page_obj, - linux_shared_page_mapping); + __elfN(linux_shared_page_fini)(linux_vdso_obj, + linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); +static void +linux_vdso_reloc(char *mapping, Elf_Addr offset) +{ + const Elf_Shdr *shdr; + const Elf_Rel *rel; + const Elf_Ehdr *ehdr; + Elf_Addr *where; + Elf_Size rtype, symidx; + Elf_Addr addr, addend; + int i, relcnt; + + MPASS(offset != 0); + + relcnt = 0; + ehdr = (const Elf_Ehdr *)mapping; + shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); + for (i = 0; i < ehdr->e_shnum; i++) + { + switch (shdr[i].sh_type) { + case SHT_REL: + rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); + relcnt = shdr[i].sh_size / sizeof(*rel); + break; + case SHT_RELA: + printf("Linux i386 vDSO: unexpected Rela section\n"); + break; + } + } + + for (i = 0; i < relcnt; i++, rel++) { + where = (Elf_Addr *)(mapping + rel->r_offset); + addend = *where; + rtype = ELF_R_TYPE(rel->r_info); + symidx = ELF_R_SYM(rel->r_info); + + switch (rtype) { + case R_386_NONE: /* none */ + break; + + case R_386_RELATIVE: /* B + A */ + addr = (Elf_Addr)PTROUT(offset + addend); + if (*where != addr) + *where = addr; + break; + + case R_386_IRELATIVE: + printf("Linux i386 vDSO: unexpected ifunc relocation, " + "symbol index %d\n", symidx); + break; + default: + printf("Linux i386 vDSO: unexpected relocation type %d, " + "symbol index %d\n", rtype, symidx); + } + } +} + static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; Index: sys/i386/linux/linux_vdso.lds.s =================================================================== --- sys/i386/linux/linux_vdso.lds.s +++ sys/i386/linux/linux_vdso.lds.s @@ -51,15 +51,30 @@ eh_frame_hdr PT_GNU_EH_FRAME; } -ENTRY(linux_vsyscall); - VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + __vdso_clock_getres; + 
__vdso_clock_gettime64; + }; + LINUX_2.5 { global: - linux_vsyscall; - linux_sigcode; - linux_rt_sigcode; + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; + + LINUX_0.0 { + global: + linux_platform; + kern_timekeep_base; + kern_tsc_selector; local: *; }; } Index: sys/i386/linux/linux_vdso_gtod.c =================================================================== --- /dev/null +++ sys/i386/linux/linux_vdso_gtod.c @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#define _KERNEL +#include +#undef _KERNEL +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* The kernel fixes these up at vDSO install */ +uintptr_t *kern_timekeep_base = NULL; +uint32_t kern_tsc_selector = 0; + +#include + +static int +write(int fd, const void *buf, size_t size) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_write), "b"(fd), "c"(buf), "d"(size) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_gettimeofday), "b"(tv), "c"(tz) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_clock_getres), "b"(clock_id), "c"(ts) + : "cc", "memory" + ); + return (res); +} + +static int +__vdso_time_fallback(long *tm) +{ + int res; + + __asm__ __volatile__ + ( + "int $0x80" + : "=a"(res) + : "a"(LINUX_SYS_linux_time), "b"(tm) + : "cc", "memory" + ); + return (res); +} + +#include Index: sys/kern/kern_exec.c =================================================================== --- sys/kern/kern_exec.c +++ sys/kern/kern_exec.c @@ -1112,8 +1112,11 @@ } } -
if (sv->sv_onexec != NULL) - sv->sv_onexec(p, imgp); + if (sv->sv_onexec != NULL) { + error = sv->sv_onexec(p, imgp); + if (error != 0) + return (error); + } /* Allocate a new stack */ if (imgp->stack_sz != 0) { Index: sys/modules/linux/Makefile =================================================================== --- sys/modules/linux/Makefile +++ sys/modules/linux/Makefile @@ -10,8 +10,6 @@ .PATH: ${SRCTOP}/sys/x86/linux .endif -VDSO= linux${SFX}_vdso - KMOD= linux SRCS= linux_fork.c linux${SFX}_dummy_machdep.c linux_file.c linux_event.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ @@ -22,7 +20,8 @@ opt_inet6.h opt_compat.h opt_posix.h opt_usb.h vnode_if.h \ device_if.h bus_if.h .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" -SRCS+= linux_dummy_x86.c +SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c +VDSODEPS=linux_vdso_gettc_x86.inc .endif .if ${MACHINE_CPUARCH} == "amd64" SRCS+= linux${SFX}_support.s @@ -37,7 +36,7 @@ SRCS+= opt_apic.h .endif -OBJS= ${VDSO}.so +OBJS= linux${SFX}_vdso.so .if ${MACHINE_CPUARCH} == "i386" SRCS+= linux_ptrace.c imgact_linux.c linux_util.c linux_mib.c linux_mmap.c \ @@ -54,33 +53,54 @@ .endif CLEANFILES= linux${SFX}_assym.h linux${SFX}_genassym.o linux${SFX}_locore.o \ - genassym.o + genassym.o linux${SFX}_vdso_gtod.o linux${SFX}_vdso.so.o linux${SFX}_assym.h: linux${SFX}_genassym.o sh ${SYSDIR}/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET} +.if ${MACHINE_CPUARCH} == "amd64" +VDSOFLAGS=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 -m32 +.endif + linux${SFX}_locore.o: linux${SFX}_assym.h assym.inc - ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -m32 -shared -s \ - -pipe -I. 
-I${SYSDIR} ${WERROR} -Wall -fno-common -nostdinc -nostdlib \ - -fno-omit-frame-pointer -fPIC \ - -Wl,-T${SRCTOP}/sys/${MACHINE_CPUARCH}/linux${SFX}/${VDSO}.lds.s \ - -Wl,-soname=${VDSO}.so.1,--eh-frame-hdr,-warn-common \ + ${CC} -c -x assembler-with-cpp -DLOCORE -fPIC -pipe -O2 -Werror \ + -msoft-float -mregparm=0 \ + -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} +linux${SFX}_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS} + ${CC} -c -fPIC -pipe -O2 -Werror -msoft-float -mregparm=0 \ + -mcmodel=small -fno-common -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ + ${.IMPSRC} -o ${.TARGET} + +linux${SFX}_vdso.so.o: linux${SFX}_locore.o linux${SFX}_vdso_gtod.o + ${LD} -m elf_i386 --shared --eh-frame-hdr -soname=linux-gate.so.1 \ + --no-undefined --hash-style=both -warn-common -nostdlib \ + --strip-debug -s --build-id=sha1 --Bsymbolic \ + -T${SRCTOP}/sys/${MACHINE}/linux${SFX}/linux${SFX}_vdso.lds.s \ + -o ${.TARGET} ${.ALLSRC:M*.o} + +.if ${MACHINE_CPUARCH} == "amd64" +OBJCOPY_TARGET=--output-target elf64-x86-64-freebsd --binary-architecture i386 +.elif ${MACHINE_CPUARCH} == "i386" +OBJCOPY_TARGET=--output-target elf32-i386-freebsd --binary-architecture i386 +.else +.error ${MACHINE_CPUARCH} not yet supported by linux +.endif + +linux${SFX}_vdso.so: linux${SFX}_vdso.so.o + ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \ + linux${SFX}_vdso.so.o ${.TARGET} + ${STRIPBIN} -N _binary_linux${SFX}_vdso_so_o_size ${.TARGET} + .if ${MACHINE_CPUARCH} == "amd64" linux${SFX}_support.o: linux${SFX}_assym.h assym.inc ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.IMPSRC} -o ${.TARGET} - -${VDSO}.so: linux${SFX}_locore.o - ${OBJCOPY} --input-target binary --output-target 
elf64-x86-64-freebsd \ - --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux${SFX}_locore_o_size ${.TARGET} -.else -${VDSO}.so: linux${SFX}_locore.o - ${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd \ - --binary-architecture i386 linux${SFX}_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET} .endif linux${SFX}_genassym.o: offset.inc Index: sys/modules/linux64/Makefile =================================================================== --- sys/modules/linux64/Makefile +++ sys/modules/linux64/Makefile @@ -5,8 +5,6 @@ .PATH: ${SRCTOP}/sys/x86/linux .endif -VDSO= linux_vdso - KMOD= linux64 SRCS= linux_fork.c linux_dummy_machdep.c linux_file.c linux_event.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ @@ -17,7 +15,7 @@ vnode_if.h device_if.h bus_if.h \ linux_support.s .if ${MACHINE_CPUARCH} == "amd64" -SRCS+= linux_dummy_x86.c +SRCS+= linux_dummy_x86.c linux_vdso_tsc_selector_x86.c .endif DPSRCS= assym.inc linux_genassym.c @@ -25,20 +23,42 @@ SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h CLEANFILES= linux_assym.h linux_genassym.o linux_locore.o \ - genassym.o + genassym.o linux_vdso_gtod.o linux_vdso.so.o -OBJS= ${VDSO}.so +OBJS= linux_vdso.so linux_assym.h: linux_genassym.o sh ${SYSDIR}/kern/genassym.sh linux_genassym.o > ${.TARGET} -linux_locore.o: linux_locore.asm linux_assym.h - ${CC} ${CCLDFLAGS} -x assembler-with-cpp -DLOCORE -shared -mcmodel=small \ - -pipe -I. 
-I${SYSDIR} ${WERROR} -Wall -fno-common -fPIC -nostdinc \ - -Wl,-T${SRCTOP}/sys/${MACHINE}/linux/${VDSO}.lds.s \ - -Wl,-soname=${VDSO}.so.1,-warn-common -nostdlib \ +.if ${MACHINE_CPUARCH} == "amd64" +VDSOFLAGS=-mregparm=0 -mcmodel=small -msoft-float +VDSODEPS=linux_vdso_gettc_x86.inc +.elif ${MACHINE_CPUARCH} == "aarch64" +VDSOFLAGS=-mgeneral-regs-only -mcmodel=small -ffixed-x18 +.endif + +linux_locore.o: linux_assym.h assym.inc + ${CC} -c -x assembler-with-cpp -DLOCORE \ + -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \ + -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} +linux_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS} + ${CC} -c -fPIC -pipe -O2 -Werror ${VDSOFLAGS} \ + -nostdinc -fasynchronous-unwind-tables \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ + ${.IMPSRC} -o ${.TARGET} + +linux_vdso.so.o: linux_locore.o linux_vdso_gtod.o + ${LD} --shared --eh-frame-hdr -soname=linux-vdso.so.1 \ + --no-undefined --hash-style=both -warn-common -nostdlib \ + --strip-debug -s --build-id=sha1 -Bsymbolic \ + -T${SRCTOP}/sys/${MACHINE}/linux/linux_vdso.lds.s \ + -o ${.TARGET} ${.ALLSRC:M*.o} + .if ${MACHINE_CPUARCH} == "aarch64" OBJCOPY_TARGET=--output-target elf64-littleaarch64 --binary-architecture aarch64 .elif ${MACHINE_CPUARCH} == "amd64" @@ -46,10 +66,11 @@ .else .error ${MACHINE_CPUARCH} not yet supported by linux64 .endif -${VDSO}.so: linux_locore.o + +linux_vdso.so: linux_vdso.so.o ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \ - linux_locore.o ${.TARGET} - ${STRIPBIN} -N _binary_linux_locore_o_size ${.TARGET} + linux_vdso.so.o ${.TARGET} + ${STRIPBIN} -N _binary_linux_vdso_so_o_size ${.TARGET} linux_support.o: assym.inc linux_assym.h ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ Index: sys/sys/sysent.h 
=================================================================== --- sys/sys/sysent.h +++ sys/sys/sysent.h @@ -145,7 +145,7 @@ u_long *sv_hwcap2; /* Value passed in AT_HWCAP2. */ const char *(*sv_machine_arch)(struct proc *); vm_offset_t sv_fxrng_gen_base; - void (*sv_onexec)(struct proc *, struct image_params *); + int (*sv_onexec)(struct proc *, struct image_params *); void (*sv_onexit)(struct proc *); void (*sv_ontdexit)(struct thread *td); int (*sv_setid_allowed)(struct thread *td, Index: sys/x86/linux/linux_vdso_gettc_x86.inc =================================================================== --- /dev/null +++ sys/x86/linux/linux_vdso_gettc_x86.inc @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2016, 2017, 2019 The FreeBSD Foundation + * Copyright (c) 2021 Dmitry Chagin + * All rights reserved. + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#if defined(__i386__) || defined(COMPAT_LINUX32) +#include +#include +#else +#include +#include +#endif + +static inline u_int +rdtsc_low(const struct vdso_timehands *th) +{ + u_int rv; + + __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" + : "=a" (rv) : "c" (th->th_x86_shift) : "edx"); + return (rv); +} + +static inline u_int +rdtscp_low(const struct vdso_timehands *th) +{ + u_int rv; + + __asm __volatile("rdtscp; movl %%edi,%%ecx; shrd %%cl, %%edx, %0" + : "=a" (rv) : "D" (th->th_x86_shift) : "ecx", "edx"); + return (rv); +} + +static u_int +rdtsc_low_mb_lfence(const struct vdso_timehands *th) +{ + lfence(); + return (rdtsc_low(th)); +} + +static u_int +rdtsc_low_mb_mfence(const struct vdso_timehands *th) +{ + mfence(); + return (rdtsc_low(th)); +} + +static u_int +rdtsc_low_mb_none(const struct vdso_timehands *th) +{ + return (rdtsc_low(th)); +} + +static u_int +rdtsc32_mb_lfence(void) +{ + lfence(); + return (rdtsc32()); +} + +static u_int +rdtsc32_mb_mfence(void) +{ + mfence(); + return (rdtsc32()); +} + +static u_int +rdtsc32_mb_none(void) +{ + return (rdtsc32()); +} + +static u_int +rdtscp32_(void) +{ + return (rdtscp32()); +} + +struct tsc_selector_tag { + u_int (*ts_rdtsc32)(void); + u_int (*ts_rdtsc_low)(const struct vdso_timehands *); +}; + +static const struct tsc_selector_tag tsc_selector[] = { + [0] = { /* Intel, LFENCE */ + .ts_rdtsc32 = rdtsc32_mb_lfence, + .ts_rdtsc_low = 
rdtsc_low_mb_lfence, + }, + [1] = { /* AMD, MFENCE */ + .ts_rdtsc32 = rdtsc32_mb_mfence, + .ts_rdtsc_low = rdtsc_low_mb_mfence, + }, + [2] = { /* No SSE2 */ + .ts_rdtsc32 = rdtsc32_mb_none, + .ts_rdtsc_low = rdtsc_low_mb_none, + }, + [3] = { /* RDTSCP */ + .ts_rdtsc32 = rdtscp32_, + .ts_rdtsc_low = rdtscp_low, + }, +}; + +static u_int +__vdso_gettc_rdtsc_low(const struct vdso_timehands *th) +{ + + return (tsc_selector[kern_tsc_selector].ts_rdtsc_low(th)); +} + +static u_int +__vdso_gettc_rdtsc32(void) +{ + + return (tsc_selector[kern_tsc_selector].ts_rdtsc32()); +} + +int +__vdso_gettc(const struct vdso_timehands *th, u_int *tc) +{ + + switch (th->th_algo) { + case VDSO_TH_ALGO_X86_TSC: + *tc = th->th_x86_shift > 0 ? __vdso_gettc_rdtsc_low(th) : + __vdso_gettc_rdtsc32(); + return (0); + case VDSO_TH_ALGO_X86_HPET: + /* TODO */ + default: + return (ENOSYS); + } +} Index: sys/x86/linux/linux_vdso_tsc_selector_x86.c =================================================================== --- /dev/null +++ sys/x86/linux/linux_vdso_tsc_selector_x86.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include + +int +linux_vdso_tsc_selector_idx() +{ + bool amd_cpu; + + if (cpu_feature == 0) + return (2); /* should not happen due to RDTSC */ + + amd_cpu = (cpu_vendor_id == CPU_VENDOR_AMD || + cpu_vendor_id == CPU_VENDOR_HYGON); + + if ((amd_feature & AMDID_RDTSCP) != 0) + return (3); + if ((cpu_feature & CPUID_SSE2) == 0) + return (2); + return (amd_cpu ? 1 : 0); +} Index: sys/x86/linux/linux_x86.h =================================================================== --- /dev/null +++ sys/x86/linux/linux_x86.h @@ -0,0 +1,33 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Dmitry Chagin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _X86_INCLUDE_LINUX_LINUX_X86_H_ +#define _X86_INCLUDE_LINUX_LINUX_X86_H_ + +int linux_vdso_tsc_selector_idx(void); + +#endif /* _X86_INCLUDE_LINUX_LINUX_X86_H_ */