diff --git a/sys/amd64/linux/linux.h b/sys/amd64/linux/linux.h index 4e736cc11c22..9080bcb36cbc 100644 --- a/sys/amd64/linux/linux.h +++ b/sys/amd64/linux/linux.h @@ -1,465 +1,465 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _AMD64_LINUX_H_ #define _AMD64_LINUX_H_ #include #include #include #define LINUX_LEGACY_SYSCALLS #define LINUX_DTRACE linuxulator /* * Provide a separate set of types for the Linux types. */ typedef int32_t l_int; typedef int64_t l_long; typedef int16_t l_short; typedef uint32_t l_uint; typedef uint64_t l_ulong; typedef uint16_t l_ushort; typedef l_ulong l_uintptr_t; typedef l_long l_clock_t; typedef l_int l_daddr_t; typedef l_ulong l_dev_t; typedef l_uint l_gid_t; typedef l_ushort l_gid16_t; typedef l_uint l_uid_t; typedef l_ushort l_uid16_t; typedef l_ulong l_ino_t; typedef l_int l_key_t; typedef l_long l_loff_t; typedef l_uint l_mode_t; typedef l_long l_off_t; typedef l_int l_pid_t; typedef l_ulong l_size_t; typedef l_long l_ssize_t; typedef l_long l_suseconds_t; typedef l_long l_time_t; typedef l_int l_timer_t; typedef l_int l_mqd_t; typedef l_size_t l_socklen_t; typedef l_ulong l_fd_mask; typedef struct { l_int val[2]; } l_fsid_t; typedef struct { l_time_t tv_sec; l_suseconds_t tv_usec; } l_timeval; #define l_fd_set fd_set /* * Miscellaneous */ #define LINUX_AT_COUNT 20 /* Count of used aux entry types. */ struct l___sysctl_args { l_uintptr_t name; l_int nlen; l_uintptr_t oldval; l_uintptr_t oldlenp; l_uintptr_t newval; l_size_t newlen; l_ulong __spare[4]; }; /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 #define LINUX_RLIMIT_DATA 2 #define LINUX_RLIMIT_STACK 3 #define LINUX_RLIMIT_CORE 4 #define LINUX_RLIMIT_RSS 5 #define LINUX_RLIMIT_NPROC 6 #define LINUX_RLIMIT_NOFILE 7 #define LINUX_RLIMIT_MEMLOCK 8 #define LINUX_RLIMIT_AS 9 /* Address space limit */ #define LINUX_RLIM_NLIMITS 10 struct l_rlimit { l_ulong rlim_cur; l_ulong rlim_max; }; /* * stat family of syscalls */ struct l_timespec { l_time_t tv_sec; l_long tv_nsec; }; struct l_newstat { l_dev_t st_dev; l_ino_t st_ino; l_ulong st_nlink; l_uint st_mode; l_uid_t st_uid; l_gid_t st_gid; l_uint __st_pad1; l_dev_t st_rdev; l_off_t st_size; l_long st_blksize; l_long st_blocks; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_long __unused1; l_long __unused2; l_long __unused3; }; /* sigaction flags */ #define LINUX_SA_NOCLDSTOP 0x00000001 #define LINUX_SA_NOCLDWAIT 0x00000002 #define LINUX_SA_SIGINFO 0x00000004 #define LINUX_SA_RESTORER 0x04000000 #define LINUX_SA_ONSTACK 0x08000000 #define LINUX_SA_RESTART 0x10000000 #define LINUX_SA_INTERRUPT 0x20000000 #define LINUX_SA_NOMASK 0x40000000 #define LINUX_SA_ONESHOT 0x80000000 /* sigprocmask actions */ #define LINUX_SIG_BLOCK 0 #define LINUX_SIG_UNBLOCK 1 #define LINUX_SIG_SETMASK 2 /* sigaltstack */ #define LINUX_MINSIGSTKSZ 2048 typedef void (*l_handler_t)(l_int); typedef struct { l_handler_t lsa_handler; l_ulong lsa_flags; l_uintptr_t lsa_restorer; l_sigset_t lsa_mask; } l_sigaction_t; typedef struct { l_uintptr_t ss_sp; l_int ss_flags; l_size_t ss_size; } l_stack_t; struct l_fpstate { u_int16_t cwd; u_int16_t swd; u_int16_t twd; u_int16_t fop; u_int64_t rip; u_int64_t rdp; u_int32_t mxcsr; u_int32_t mxcsr_mask; u_int32_t st_space[32]; u_int32_t xmm_space[64]; u_int32_t reserved2[24]; }; struct l_sigcontext { l_ulong sc_r8; l_ulong sc_r9; l_ulong sc_r10; l_ulong sc_r11; l_ulong sc_r12; l_ulong sc_r13; l_ulong sc_r14; l_ulong sc_r15; l_ulong sc_rdi; l_ulong sc_rsi; l_ulong sc_rbp; l_ulong sc_rbx; l_ulong sc_rdx; l_ulong sc_rax; l_ulong sc_rcx; l_ulong sc_rsp; l_ulong sc_rip; l_ulong sc_rflags; l_ushort sc_cs; l_ushort sc_gs; l_ushort sc_fs; l_ushort sc___pad0; l_ulong sc_err; l_ulong sc_trapno; l_sigset_t sc_mask; l_ulong sc_cr2; struct l_fpstate *sc_fpstate; l_ulong sc_reserved1[8]; }; struct l_ucontext { l_ulong uc_flags; l_uintptr_t uc_link; l_stack_t uc_stack; struct l_sigcontext uc_mcontext; l_sigset_t uc_sigmask; }; #define LINUX_SI_PREAMBLE_SIZE (4 * sizeof(int)) #define LINUX_SI_MAX_SIZE 128 #define LINUX_SI_PAD_SIZE ((LINUX_SI_MAX_SIZE - \ LINUX_SI_PREAMBLE_SIZE) / sizeof(l_int)) typedef union l_sigval { l_int sival_int; l_uintptr_t sival_ptr; } l_sigval_t; typedef struct l_siginfo { l_int lsi_signo; l_int lsi_errno; l_int lsi_code; union { l_int _pad[LINUX_SI_PAD_SIZE]; struct { l_pid_t _pid; l_uid_t _uid; } _kill; struct { l_timer_t _tid; l_int _overrun; char _pad[sizeof(l_uid_t) - sizeof(int)]; union l_sigval _sigval; l_uint _sys_private; } _timer; struct { l_pid_t _pid; /* sender's pid */ l_uid_t _uid; /* sender's uid */ union l_sigval _sigval; } _rt; struct { l_pid_t _pid; /* which child */ l_uid_t _uid; /* sender's uid */ l_int _status; /* exit code */ l_clock_t _utime; l_clock_t _stime; } _sigchld; struct { l_uintptr_t _addr; /* Faulting insn/memory ref. */ } _sigfault; struct { l_long _band; /* POLL_IN,POLL_OUT,POLL_MSG */ l_int _fd; } _sigpoll; } _sifields; } l_siginfo_t; #define lsi_pid _sifields._kill._pid #define lsi_uid _sifields._kill._uid #define lsi_tid _sifields._timer._tid #define lsi_overrun _sifields._timer._overrun #define lsi_sys_private _sifields._timer._sys_private #define lsi_status _sifields._sigchld._status #define lsi_utime _sifields._sigchld._utime #define lsi_stime _sifields._sigchld._stime #define lsi_value _sifields._rt._sigval #define lsi_int _sifields._rt._sigval.sival_int #define lsi_ptr _sifields._rt._sigval.sival_ptr #define lsi_addr _sifields._sigfault._addr #define lsi_band _sifields._sigpoll._band #define lsi_fd _sifields._sigpoll._fd /* * We make the stack look like Linux expects it when calling a signal * handler, but use the BSD way of calling the handler and sigreturn(). * This means that we need to pass the pointer to the handler too. * It is appended to the frame to not interfere with the rest of it. */ struct l_rt_sigframe { struct l_ucontext sf_sc; struct l_siginfo sf_si; l_handler_t sf_handler; }; /* * mount flags */ #define LINUX_MS_RDONLY 0x0001 #define LINUX_MS_NOSUID 0x0002 #define LINUX_MS_NODEV 0x0004 #define LINUX_MS_NOEXEC 0x0008 #define LINUX_MS_REMOUNT 0x0020 /* * SystemV IPC defines */ #define LINUX_IPC_RMID 0 #define LINUX_IPC_SET 1 #define LINUX_IPC_STAT 2 #define LINUX_IPC_INFO 3 #define LINUX_SHM_LOCK 11 #define LINUX_SHM_UNLOCK 12 #define LINUX_SHM_STAT 13 #define LINUX_SHM_INFO 14 #define LINUX_SHM_RDONLY 0x1000 #define LINUX_SHM_RND 0x2000 #define LINUX_SHM_REMAP 0x4000 /* semctl commands */ #define LINUX_GETPID 11 #define LINUX_GETVAL 12 #define LINUX_GETALL 13 #define LINUX_GETNCNT 14 #define LINUX_GETZCNT 15 #define LINUX_SETVAL 16 #define LINUX_SETALL 17 #define LINUX_SEM_STAT 18 #define LINUX_SEM_INFO 19 union l_semun { l_int val; l_uintptr_t buf; l_uintptr_t array; l_uintptr_t __buf; l_uintptr_t __pad; }; struct l_ifmap { l_ulong mem_start; l_ulong mem_end; l_ushort base_addr; u_char irq; u_char dma; u_char port; /* 3 bytes spare */ }; struct l_ifreq { union { char ifrn_name[LINUX_IFNAMSIZ]; } ifr_ifrn; union { struct l_sockaddr ifru_addr; struct l_sockaddr ifru_dstaddr; struct l_sockaddr ifru_broadaddr; struct l_sockaddr ifru_netmask; struct l_sockaddr ifru_hwaddr; l_short ifru_flags[1]; l_int ifru_ivalue; l_int ifru_mtu; struct l_ifmap ifru_map; char ifru_slave[LINUX_IFNAMSIZ]; l_uintptr_t ifru_data; } ifr_ifru; }; #define ifr_name ifr_ifrn.ifrn_name /* Interface name */ #define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ #define ifr_ifindex ifr_ifru.ifru_ivalue /* Interface index */ struct l_ifconf { int ifc_len; union { l_uintptr_t ifcu_buf; l_uintptr_t ifcu_req; } ifc_ifcu; }; #define ifc_buf ifc_ifcu.ifcu_buf #define ifc_req ifc_ifcu.ifcu_req #define LINUX_ARCH_SET_GS 0x1001 #define LINUX_ARCH_SET_FS 0x1002 #define LINUX_ARCH_GET_FS 0x1003 #define LINUX_ARCH_GET_GS 0x1004 #define LINUX_ARCH_CET_STATUS 0x3001 #define linux_copyout_rusage(r, u) copyout(r, u, sizeof(*r)) /* robust futexes */ struct linux_robust_list { l_uintptr_t next; }; struct linux_robust_list_head { struct linux_robust_list list; l_long futex_offset; l_uintptr_t pending_list; }; /* This corresponds to 'struct user_regs_struct' in Linux. */ struct linux_pt_regset { l_ulong r15; l_ulong r14; l_ulong r13; l_ulong r12; l_ulong rbp; l_ulong rbx; l_ulong r11; l_ulong r10; l_ulong r9; l_ulong r8; l_ulong rax; l_ulong rcx; l_ulong rdx; l_ulong rsi; l_ulong rdi; l_ulong orig_rax; l_ulong rip; l_ulong cs; l_ulong eflags; l_ulong rsp; l_ulong ss; l_ulong fs_base; l_ulong gs_base; l_ulong ds; l_ulong es; l_ulong fs; l_ulong gs; }; struct reg; void bsd_to_linux_regset(struct reg *b_reg, struct linux_pt_regset *l_regset); #endif /* !_AMD64_LINUX_H_ */ diff --git a/sys/amd64/linux/linux_dummy_machdep.c b/sys/amd64/linux/linux_dummy_machdep.c index aba79b69d55d..4e4d0f9c0d8b 100644 --- a/sys/amd64/linux/linux_dummy_machdep.c +++ b/sys/amd64/linux/linux_dummy_machdep.c @@ -1,74 +1,73 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /* * Before adding new stubs to this file, please check if a stub can be added to * the machine-independent code in sys/compat/linux/linux_dummy.c (or * sys/x86/linux/linux_dummy_x86.c). */ UNIMPLEMENTED(get_thread_area); UNIMPLEMENTED(set_thread_area); UNIMPLEMENTED(uselib); DUMMY(modify_ldt); DUMMY(ioperm); DUMMY(io_setup); DUMMY(io_destroy); DUMMY(io_getevents); DUMMY(io_submit); DUMMY(io_cancel); DUMMY(mq_open); DUMMY(mq_unlink); DUMMY(mq_timedsend); DUMMY(mq_timedreceive); DUMMY(mq_notify); DUMMY(mq_getsetattr); DUMMY(readahead); DUMMY(restart_syscall); DUMMY(semtimedop); /* Linux 3.15: */ DUMMY(kexec_file_load); diff --git a/sys/amd64/linux/linux_machdep.c b/sys/amd64/linux/linux_machdep.c index b54674e793c8..d7b31ce89509 100644 --- a/sys/amd64/linux/linux_machdep.c +++ b/sys/amd64/linux/linux_machdep.c @@ -1,383 +1,383 @@ /*- - * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int linux_execve(struct thread *td, struct linux_execve_args *args) { struct image_args eargs; char *path; int error; LINUX_CTR(execve); if (!LUSECONVPATH(td)) { error = exec_copyin_args(&eargs, args->path, UIO_USERSPACE, args->argp, args->envp); } else { LCONVPATHEXIST(td, args->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, args->envp); LFREEPATH(path); } if (error == 0) error = linux_common_execve(td, &eargs); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } int linux_set_upcall(struct thread *td, register_t stack) { if (stack) td->td_frame->tf_rsp = stack; /* * The newly created Linux thread returns * to the user space by the same path that a parent does. */ td->td_frame->tf_rax = 0; return (0); } int linux_mmap2(struct thread *td, struct linux_mmap2_args *args) { return (linux_mmap_common(td, args->addr, args->len, args->prot, args->flags, args->fd, args->pgoff)); } int linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) { return (linux_mprotect_common(td, uap->addr, uap->len, uap->prot)); } int linux_madvise(struct thread *td, struct linux_madvise_args *uap) { return (linux_madvise_common(td, uap->addr, uap->len, uap->behav)); } int linux_iopl(struct thread *td, struct linux_iopl_args *args) { int error; LINUX_CTR(iopl); if (args->level > 3) return (EINVAL); if ((error = priv_check(td, PRIV_IO)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | (args->level * (PSL_IOPL / 3)); return (0); } int linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) { l_sigset_t lmask; sigset_t sigmask; int error; LINUX_CTR2(rt_sigsuspend, "%p, %ld", uap->newset, uap->sigsetsize); if (uap->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); if (error) return (error); linux_to_bsd_sigset(&lmask, &sigmask); return (kern_sigsuspend(td, sigmask)); } int linux_pause(struct thread *td, struct linux_pause_args *args) { struct proc *p = td->td_proc; sigset_t sigmask; LINUX_CTR(pause); PROC_LOCK(p); sigmask = td->td_sigmask; PROC_UNLOCK(p); return (kern_sigsuspend(td, sigmask)); } int linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) { stack_t ss, oss; l_stack_t lss; int error; memset(&lss, 0, sizeof(lss)); LINUX_CTR2(sigaltstack, "%p, %p", uap->uss, uap->uoss); if (uap->uss != NULL) { error = copyin(uap->uss, &lss, sizeof(l_stack_t)); if (error) return (error); ss.ss_sp = PTRIN(lss.ss_sp); ss.ss_size = lss.ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); } error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, (uap->uoss != NULL) ? &oss : NULL); if (!error && uap->uoss != NULL) { lss.ss_sp = PTROUT(oss.ss_sp); lss.ss_size = oss.ss_size; lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); } return (error); } int linux_arch_prctl(struct thread *td, struct linux_arch_prctl_args *args) { unsigned long long cet[3]; struct pcb *pcb; int error; pcb = td->td_pcb; LINUX_CTR2(arch_prctl, "0x%x, %p", args->code, args->addr); switch (args->code) { case LINUX_ARCH_SET_GS: if (args->addr < VM_MAXUSER_ADDRESS) { update_pcb_bases(pcb); pcb->pcb_gsbase = args->addr; td->td_frame->tf_gs = _ugssel; error = 0; } else error = EPERM; break; case LINUX_ARCH_SET_FS: if (args->addr < VM_MAXUSER_ADDRESS) { update_pcb_bases(pcb); pcb->pcb_fsbase = args->addr; td->td_frame->tf_fs = _ufssel; error = 0; } else error = EPERM; break; case LINUX_ARCH_GET_FS: error = copyout(&pcb->pcb_fsbase, PTRIN(args->addr), sizeof(args->addr)); break; case LINUX_ARCH_GET_GS: error = copyout(&pcb->pcb_gsbase, PTRIN(args->addr), sizeof(args->addr)); break; case LINUX_ARCH_CET_STATUS: memset(cet, 0, sizeof(cet)); error = copyout(&cet, PTRIN(args->addr), sizeof(cet)); break; default: linux_msg(td, "unsupported arch_prctl code %#x", args->code); error = EINVAL; } return (error); } int linux_set_cloned_tls(struct thread *td, void *desc) { struct pcb *pcb; if ((uint64_t)desc >= VM_MAXUSER_ADDRESS) return (EPERM); pcb = td->td_pcb; update_pcb_bases(pcb); pcb->pcb_fsbase = (register_t)desc; td->td_frame->tf_fs = _ufssel; return (0); } int futex_xchgl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_xchgl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_xchgl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_xchgl_smap : futex_xchgl_nosmap); } int futex_addl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_addl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_addl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_addl_smap : futex_addl_nosmap); } int futex_orl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_orl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_orl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_orl_smap : futex_orl_nosmap); } int futex_andl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_andl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_andl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_andl_smap : futex_andl_nosmap); } int futex_xorl_nosmap(int oparg, uint32_t *uaddr, int *oldval); int futex_xorl_smap(int oparg, uint32_t *uaddr, int *oldval); DEFINE_IFUNC(, int, futex_xorl, (int, uint32_t *, int *)) { return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 ? futex_xorl_smap : futex_xorl_nosmap); } void bsd_to_linux_regset(struct reg *b_reg, struct linux_pt_regset *l_regset) { l_regset->r15 = b_reg->r_r15; l_regset->r14 = b_reg->r_r14; l_regset->r13 = b_reg->r_r13; l_regset->r12 = b_reg->r_r12; l_regset->rbp = b_reg->r_rbp; l_regset->rbx = b_reg->r_rbx; l_regset->r11 = b_reg->r_r11; l_regset->r10 = b_reg->r_r10; l_regset->r9 = b_reg->r_r9; l_regset->r8 = b_reg->r_r8; l_regset->rax = b_reg->r_rax; l_regset->rcx = b_reg->r_rcx; l_regset->rdx = b_reg->r_rdx; l_regset->rsi = b_reg->r_rsi; l_regset->rdi = b_reg->r_rdi; l_regset->orig_rax = b_reg->r_rax; l_regset->rip = b_reg->r_rip; l_regset->cs = b_reg->r_cs; l_regset->eflags = b_reg->r_rflags; l_regset->rsp = b_reg->r_rsp; l_regset->ss = b_reg->r_ss; l_regset->fs_base = 0; l_regset->gs_base = 0; l_regset->ds = b_reg->r_ds; l_regset->es = b_reg->r_es; l_regset->fs = b_reg->r_fs; l_regset->gs = b_reg->r_gs; } diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index fbc397ec139e..dc9a77093bcd 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -1,1061 +1,1061 @@ /*- - * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. + * Copyright (c) 2013, 2021 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux64, 1); #define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX_VDSOPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - \ LINUX_VDSOPAGE_SIZE) #define LINUX_SHAREDPAGE_LA48 (LINUX_VDSOPAGE_LA48 - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX_USRSTACK_LA48 LINUX_SHAREDPAGE_LA48 #define LINUX_PS_STRINGS_LA48 (LINUX_USRSTACK_LA48 - \ sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux_vdso_so_o_start; extern char _binary_linux_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_vsyscall(struct thread *td); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; sa->code = frame->tf_rax; sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_rax = td->td_retval[0]; frame->tf_r10 = frame->tf_rcx; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes, * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. * We saved this in tf_err. * */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; break; case EJUSTRETURN: break; default: frame->tf_rax = bsd_to_linux_errno(error); frame->tf_r10 = frame->tf_rcx; break; } /* * Differently from FreeBSD native ABI, on Linux only %rcx * and %r11 values are not preserved across the syscall. * Require full context restore to get all registers except * those two restored at return to usermode. * * XXX: Would be great to be able to avoid PCB_FULL_IRET * for the error == 0 case. */ set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { Elf_Auxargs *args; Elf_Auxinfo *argarray, *pos; struct proc *p; int error, issetugid; p = imgp->proc; args = (Elf64_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf_Addr *base; base = (Elf64_Addr *)*stack_base; base--; if (suword(base, (uint64_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf64_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * Starting with 2.24, glibc depends on a 16-byte stack alignment. * One "long argc" will be prepended later. */ vectp = (char **)((((uintptr_t)vectp + 8) & ~0xF) - 8); /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; set_pcb_flags(pcb, PCB_FULL_IRET); saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } /* * Copied from amd64/amd64/machdep.c * * XXX fpu state need? don't think so */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_ucontext uc; struct l_sigcontext *context; struct trapframe *regs; unsigned long rflags; int error; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); if (error != 0) return (error); p = td->td_proc; context = &uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; int sig, code; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Save user context. */ bzero(&sf, sizeof(sf)); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe); } else sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig); sf.sf_handler = catcher; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #define LINUX_VSYSCALL_START (-10UL << 20) #define LINUX_VSYSCALL_SZ 1024 const unsigned long linux_vsyscall_vector[] = { LINUX_SYS_gettimeofday, LINUX_SYS_linux_time, LINUX_SYS_linux_getcpu, }; static int linux_vsyscall(struct thread *td) { struct trapframe *frame; uint64_t retqaddr; int code, traced; int error; frame = td->td_frame; /* Check %rip for vsyscall area. */ if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) return (EINVAL); if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) return (EINVAL); code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; if (code >= nitems(linux_vsyscall_vector)) return (EINVAL); /* * vsyscall called as callq *(%rax), so we must * use return address from %rsp and also fixup %rsp. */ error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); if (error) return (error); frame->tf_rip = retqaddr; frame->tf_rax = linux_vsyscall_vector[code]; frame->tf_rsp += 8; traced = (frame->tf_flags & PSL_T); amd64_syscall(td, traced); return (0); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, .sv_usrstack = LINUX_USRSTACK_LA48, .sv_psstrings = LINUX_PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error; error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); if (error == 0) linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). */ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; amd64_lower_shared_page(sv); /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_timekeep_base; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux_vdso_so_o_start; char *vdso_end = &_binary_linux_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); linux_vdso_base = LINUX_VDSOPAGE_LA48; if (hw_lower_amd64_sharedpage != 0) linux_vdso_base -= PAGE_SIZE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Ehdr *ehdr; const Elf_Shdr *shdr; Elf64_Addr *where, val; Elf_Size rtype, symidx; const Elf_Rela *rela; Elf_Addr addr, addend; int relacnt; int i, j; MPASS(offset != 0); relacnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: printf("Linux x86_64 vDSO: unexpected Rel section\n"); break; case SHT_RELA: rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset); relacnt = shdr[i].sh_size / sizeof(*rela); } } for (j = 0; j < relacnt; j++, rela++) { where = (Elf_Addr *)(mapping + rela->r_offset); addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); switch (rtype) { case R_X86_64_NONE: /* none */ break; case R_X86_64_RELATIVE: /* B + A */ addr = (Elf_Addr)(offset + addend); val = addr; if (*where != val) *where = val; break; case R_X86_64_IRELATIVE: printf("Linux x86_64 vDSO: unexpected ifunc relocation, " "symbol index %ld\n", symidx); break; default: printf("Linux x86_64 vDSO: unexpected relocation type %ld, " "symbol index %ld\n", rtype, symidx); } } } static char GNULINUX_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNULINUX_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-x86_64.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux x86-64 ELF exec handler installed\n"); } else printf("cannot insert Linux x86-64 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux x86_64 ELF exec handler removed\n"); } else printf("Could not deinstall Linux x86_64 ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "Linux 64bit support"); diff --git a/sys/arm64/linux/linux.h b/sys/arm64/linux/linux.h index 2e683b77f8e8..ab3bab8264f4 100644 --- a/sys/arm64/linux/linux.h +++ b/sys/arm64/linux/linux.h @@ -1,313 +1,313 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt - * Copyright (c) 2013 Dmitry Chagin + * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2018 Turing Robotic Industries Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ */ #ifndef _ARM64_LINUX_H_ #define _ARM64_LINUX_H_ #include #include #include #define LINUX_DTRACE linuxulator /* Provide a separate set of types for the Linux types */ typedef int32_t l_int; typedef int64_t l_long; typedef int16_t l_short; typedef uint32_t l_uint; typedef uint64_t l_ulong; typedef uint16_t l_ushort; typedef l_ulong l_uintptr_t; typedef l_long l_clock_t; typedef l_int l_daddr_t; typedef l_ulong l_dev_t; typedef l_uint l_gid_t; typedef l_ushort l_gid16_t; /* XXX */ typedef l_uint l_uid_t; typedef l_ushort l_uid16_t; /* XXX */ typedef l_ulong l_ino_t; typedef l_int l_key_t; typedef l_long l_loff_t; typedef l_uint l_mode_t; typedef l_long l_off_t; typedef l_int l_pid_t; typedef l_ulong l_size_t; typedef l_long l_suseconds_t; typedef l_long l_time_t; typedef l_int l_timer_t; /* XXX */ typedef l_int l_mqd_t; typedef l_ulong l_fd_mask; typedef struct { l_int val[2]; } l_fsid_t; typedef struct { l_time_t tv_sec; l_suseconds_t tv_usec; } l_timeval; #define l_fd_set fd_set /* Miscellaneous */ #define LINUX_AT_COUNT 20 struct l___sysctl_args { l_uintptr_t name; l_int nlen; l_uintptr_t oldval; l_uintptr_t oldlenp; l_uintptr_t newval; l_uintptr_t newlen; l_ulong __spare[4]; }; /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 #define LINUX_RLIMIT_DATA 2 #define LINUX_RLIMIT_STACK 3 #define LINUX_RLIMIT_CORE 4 #define LINUX_RLIMIT_RSS 5 #define LINUX_RLIMIT_NPROC 6 #define LINUX_RLIMIT_NOFILE 7 #define LINUX_RLIMIT_MEMLOCK 8 #define LINUX_RLIMIT_AS 9 /* Address space limit */ #define LINUX_RLIM_NLIMITS 10 struct l_rlimit { l_ulong rlim_cur; l_ulong rlim_max; }; /* stat family of syscalls */ struct l_timespec { l_time_t tv_sec; l_long tv_nsec; }; struct l_newstat { l_dev_t st_dev; l_ino_t st_ino; l_uint st_mode; l_uint st_nlink; l_uid_t st_uid; l_gid_t st_gid; l_dev_t st_rdev; l_ulong __st_pad1; l_off_t st_size; l_int st_blksize; l_int __st_pad2; l_long st_blocks; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_uint __unused1; l_uint __unused2; }; /* sigaction flags */ #define LINUX_SA_NOCLDSTOP 0x00000001 #define LINUX_SA_NOCLDWAIT 0x00000002 #define LINUX_SA_SIGINFO 0x00000004 #define LINUX_SA_RESTORER 0x04000000 #define LINUX_SA_ONSTACK 0x08000000 #define LINUX_SA_RESTART 0x10000000 #define LINUX_SA_INTERRUPT 0x20000000 /* XXX */ #define LINUX_SA_NOMASK 0x40000000 /* SA_NODEFER */ #define LINUX_SA_ONESHOT 0x80000000 /* SA_RESETHAND */ /* sigprocmask actions */ #define LINUX_SIG_BLOCK 0 #define LINUX_SIG_UNBLOCK 1 #define LINUX_SIG_SETMASK 2 /* sigaltstack */ #define LINUX_MINSIGSTKSZ 2048 /* XXX */ typedef void (*l_handler_t)(l_int); typedef struct { l_handler_t lsa_handler; l_ulong lsa_flags; l_uintptr_t lsa_restorer; l_sigset_t lsa_mask; } l_sigaction_t; /* XXX */ typedef struct { l_uintptr_t ss_sp; l_int ss_flags; l_size_t ss_size; } l_stack_t; #define LINUX_SI_PREAMBLE_SIZE (4 * sizeof(int)) #define LINUX_SI_MAX_SIZE 128 #define LINUX_SI_PAD_SIZE ((LINUX_SI_MAX_SIZE - \ LINUX_SI_PREAMBLE_SIZE) / sizeof(l_int)) typedef union l_sigval { l_int sival_int; l_uintptr_t sival_ptr; } l_sigval_t; typedef struct l_siginfo { l_int lsi_signo; l_int lsi_errno; l_int lsi_code; union { l_int _pad[LINUX_SI_PAD_SIZE]; struct { l_pid_t _pid; l_uid_t _uid; } _kill; struct { l_timer_t _tid; l_int _overrun; char _pad[sizeof(l_uid_t) - sizeof(int)]; union l_sigval _sigval; l_uint _sys_private; } _timer; struct { l_pid_t _pid; /* sender's pid */ l_uid_t _uid; /* sender's uid */ union l_sigval _sigval; } _rt; struct { l_pid_t _pid; /* which child */ l_uid_t _uid; /* sender's uid */ l_int _status; /* exit code */ l_clock_t _utime; l_clock_t _stime; } _sigchld; struct { l_uintptr_t _addr; /* Faulting insn/memory ref. */ } _sigfault; struct { l_long _band; /* POLL_IN,POLL_OUT,POLL_MSG */ l_int _fd; } _sigpoll; } _sifields; } l_siginfo_t; #define lsi_pid _sifields._kill._pid #define lsi_uid _sifields._kill._uid #define lsi_tid _sifields._timer._tid #define lsi_overrun _sifields._timer._overrun #define lsi_sys_private _sifields._timer._sys_private #define lsi_status _sifields._sigchld._status #define lsi_utime _sifields._sigchld._utime #define lsi_stime _sifields._sigchld._stime #define lsi_value _sifields._rt._sigval #define lsi_int _sifields._rt._sigval.sival_int #define lsi_ptr _sifields._rt._sigval.sival_ptr #define lsi_addr _sifields._sigfault._addr #define lsi_band _sifields._sigpoll._band #define lsi_fd _sifields._sigpoll._fd union l_semun { l_int val; l_uintptr_t buf; l_uintptr_t array; l_uintptr_t __buf; l_uintptr_t __pad; }; struct l_ifmap { l_ulong mem_start; l_ulong mem_end; l_ushort base_addr; u_char irq; u_char dma; u_char port; /* 3 bytes spare*/ }; struct l_ifreq { union { char ifrn_name[LINUX_IFNAMSIZ]; } ifr_ifrn; union { struct l_sockaddr ifru_addr; struct l_sockaddr ifru_dstaddr; struct l_sockaddr ifru_broadaddr; struct l_sockaddr ifru_netmask; struct l_sockaddr ifru_hwaddr; l_short ifru_flags[1]; l_int ifru_ivalue; l_int ifru_mtu; struct l_ifmap ifru_map; char ifru_slave[LINUX_IFNAMSIZ]; l_uintptr_t ifru_data; } ifr_ifru; }; #define ifr_name ifr_ifrn.ifrn_name /* Interface name */ #define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ #define ifr_ifindex ifr_ifru.ifru_ivalue /* Interface index */ #define linux_copyout_rusage(r, u) copyout(r, u, sizeof(*r)) /* robust futexes */ struct linux_robust_list { l_uintptr_t next; }; struct linux_robust_list_head { struct linux_robust_list list; l_long futex_offset; l_uintptr_t pending_list; }; struct linux_pt_regset { l_ulong x[31]; l_ulong sp; l_ulong pc; l_ulong cpsr; }; struct reg; void bsd_to_linux_regset(struct reg *b_reg, struct linux_pt_regset *l_regset); #endif /* _ARM64_LINUX_H_ */ diff --git a/sys/arm64/linux/linux_dummy_machdep.c b/sys/arm64/linux/linux_dummy_machdep.c index 4bd339de4af5..0b722c632709 100644 --- a/sys/arm64/linux/linux_dummy_machdep.c +++ b/sys/arm64/linux/linux_dummy_machdep.c @@ -1,64 +1,63 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /* * Before adding new stubs to this file, please check if a stub can be added to * the machine-independent code in sys/compat/linux/linux_dummy.c. */ UNIMPLEMENTED(get_thread_area); UNIMPLEMENTED(set_thread_area); UNIMPLEMENTED(uselib); DUMMY(mq_open); DUMMY(mq_unlink); DUMMY(mq_timedsend); DUMMY(mq_timedreceive); DUMMY(mq_notify); DUMMY(mq_getsetattr); DUMMY(semtimedop); DUMMY(kexec_file_load); diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c index 350d2c1abaf9..e72e5cbc7bfc 100644 --- a/sys/compat/linux/linux.c +++ b/sys/compat/linux/linux.c @@ -1,725 +1,724 @@ /*- - * Copyright (c) 2015 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2015 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct futex_list futex_list; struct mtx futex_mtx; /* protects the futex list */ CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ); static int bsd_to_linux_sigtbl[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, /* SIGHUP */ LINUX_SIGINT, /* SIGINT */ LINUX_SIGQUIT, /* SIGQUIT */ LINUX_SIGILL, /* SIGILL */ LINUX_SIGTRAP, /* SIGTRAP */ LINUX_SIGABRT, /* SIGABRT */ 0, /* SIGEMT */ LINUX_SIGFPE, /* SIGFPE */ LINUX_SIGKILL, /* SIGKILL */ LINUX_SIGBUS, /* SIGBUS */ LINUX_SIGSEGV, /* SIGSEGV */ LINUX_SIGSYS, /* SIGSYS */ LINUX_SIGPIPE, /* SIGPIPE */ LINUX_SIGALRM, /* SIGALRM */ LINUX_SIGTERM, /* SIGTERM */ LINUX_SIGURG, /* SIGURG */ LINUX_SIGSTOP, /* SIGSTOP */ LINUX_SIGTSTP, /* SIGTSTP */ LINUX_SIGCONT, /* SIGCONT */ LINUX_SIGCHLD, /* SIGCHLD */ LINUX_SIGTTIN, /* SIGTTIN */ LINUX_SIGTTOU, /* SIGTTOU */ LINUX_SIGIO, /* SIGIO */ LINUX_SIGXCPU, /* SIGXCPU */ LINUX_SIGXFSZ, /* SIGXFSZ */ LINUX_SIGVTALRM,/* SIGVTALRM */ LINUX_SIGPROF, /* SIGPROF */ LINUX_SIGWINCH, /* SIGWINCH */ 0, /* SIGINFO */ LINUX_SIGUSR1, /* SIGUSR1 */ LINUX_SIGUSR2 /* SIGUSR2 */ }; static int linux_to_bsd_sigtbl[LINUX_SIGTBLSZ] = { SIGHUP, /* LINUX_SIGHUP */ SIGINT, /* LINUX_SIGINT */ SIGQUIT, /* LINUX_SIGQUIT */ SIGILL, /* LINUX_SIGILL */ SIGTRAP, /* LINUX_SIGTRAP */ SIGABRT, /* LINUX_SIGABRT */ SIGBUS, /* LINUX_SIGBUS */ SIGFPE, /* LINUX_SIGFPE */ SIGKILL, /* LINUX_SIGKILL */ SIGUSR1, /* LINUX_SIGUSR1 */ SIGSEGV, /* LINUX_SIGSEGV */ SIGUSR2, /* LINUX_SIGUSR2 */ SIGPIPE, /* LINUX_SIGPIPE */ SIGALRM, /* LINUX_SIGALRM */ SIGTERM, /* LINUX_SIGTERM */ SIGBUS, /* LINUX_SIGSTKFLT */ SIGCHLD, /* LINUX_SIGCHLD */ SIGCONT, /* LINUX_SIGCONT */ SIGSTOP, /* LINUX_SIGSTOP */ SIGTSTP, /* LINUX_SIGTSTP */ SIGTTIN, /* LINUX_SIGTTIN */ SIGTTOU, /* LINUX_SIGTTOU */ SIGURG, /* LINUX_SIGURG */ SIGXCPU, /* LINUX_SIGXCPU */ SIGXFSZ, /* LINUX_SIGXFSZ */ SIGVTALRM, /* LINUX_SIGVTALARM */ SIGPROF, /* LINUX_SIGPROF */ SIGWINCH, /* LINUX_SIGWINCH */ SIGIO, /* LINUX_SIGIO */ /* * FreeBSD does not have SIGPWR signal, map Linux SIGPWR signal * to the first unused FreeBSD signal number. Since Linux supports * signals from 1 to 64 we are ok here as our SIGRTMIN = 65. */ SIGRTMIN, /* LINUX_SIGPWR */ SIGSYS /* LINUX_SIGSYS */ }; static struct cdev *dev_shm_cdev; static struct cdevsw dev_shm_cdevsw = { .d_version = D_VERSION, .d_name = "dev_shm", }; /* * Map Linux RT signals to the FreeBSD RT signals. */ static inline int linux_to_bsd_rt_signal(int sig) { return (SIGRTMIN + 1 + sig - LINUX_SIGRTMIN); } static inline int bsd_to_linux_rt_signal(int sig) { return (sig - SIGRTMIN - 1 + LINUX_SIGRTMIN); } int linux_to_bsd_signal(int sig) { KASSERT(sig > 0 && sig <= LINUX_SIGRTMAX, ("invalid Linux signal %d\n", sig)); if (sig < LINUX_SIGRTMIN) return (linux_to_bsd_sigtbl[_SIG_IDX(sig)]); return (linux_to_bsd_rt_signal(sig)); } int bsd_to_linux_signal(int sig) { if (sig <= LINUX_SIGTBLSZ) return (bsd_to_linux_sigtbl[_SIG_IDX(sig)]); if (sig == SIGRTMIN) return (LINUX_SIGPWR); return (bsd_to_linux_rt_signal(sig)); } int linux_to_bsd_sigaltstack(int lsa) { int bsa = 0; if (lsa & LINUX_SS_DISABLE) bsa |= SS_DISABLE; /* * Linux ignores SS_ONSTACK flag for ss * parameter while FreeBSD prohibits it. */ return (bsa); } int bsd_to_linux_sigaltstack(int bsa) { int lsa = 0; if (bsa & SS_DISABLE) lsa |= LINUX_SS_DISABLE; if (bsa & SS_ONSTACK) lsa |= LINUX_SS_ONSTACK; return (lsa); } void linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss) { int b, l; SIGEMPTYSET(*bss); for (l = 1; l <= LINUX_SIGRTMAX; l++) { if (LINUX_SIGISMEMBER(*lss, l)) { b = linux_to_bsd_signal(l); if (b) SIGADDSET(*bss, b); } } } void bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss) { int b, l; LINUX_SIGEMPTYSET(*lss); for (b = 1; b <= SIGRTMAX; b++) { if (SIGISMEMBER(*bss, b)) { l = bsd_to_linux_signal(b); if (l) LINUX_SIGADDSET(*lss, l); } } } /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. */ struct ifnet * ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int index; bool is_eth, is_lo; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len]) || lxname[len] == '\0') break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); /* Linux loopback interface name is lo (not lo0) */ is_lo = (len == 2 && strncmp(lxname, "lo", len) == 0); unit = (int)strtoul(lxname + len, &ep, 10); if ((ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) && is_lo == 0) return (NULL); index = 0; is_eth = (len == 3 && strncmp(lxname, "eth", len) == 0); CURVNET_SET(TD_TO_VNET(td)); IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; if (is_lo && IFP_IS_LOOP(ifp)) break; } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (ifp != NULL && bsdname != NULL) strlcpy(bsdname, ifp->if_xname, IFNAMSIZ); return (ifp); } void linux_ifflags(struct ifnet *ifp, short *flags) { unsigned short fl; fl = (ifp->if_flags | ifp->if_drv_flags) & 0xffff; *flags = 0; if (fl & IFF_UP) *flags |= LINUX_IFF_UP; if (fl & IFF_BROADCAST) *flags |= LINUX_IFF_BROADCAST; if (fl & IFF_DEBUG) *flags |= LINUX_IFF_DEBUG; if (fl & IFF_LOOPBACK) *flags |= LINUX_IFF_LOOPBACK; if (fl & IFF_POINTOPOINT) *flags |= LINUX_IFF_POINTOPOINT; if (fl & IFF_DRV_RUNNING) *flags |= LINUX_IFF_RUNNING; if (fl & IFF_NOARP) *flags |= LINUX_IFF_NOARP; if (fl & IFF_PROMISC) *flags |= LINUX_IFF_PROMISC; if (fl & IFF_ALLMULTI) *flags |= LINUX_IFF_ALLMULTI; if (fl & IFF_MULTICAST) *flags |= LINUX_IFF_MULTICAST; } int linux_ifhwaddr(struct ifnet *ifp, struct l_sockaddr *lsa) { struct ifaddr *ifa; struct sockaddr_dl *sdl; if (IFP_IS_LOOP(ifp)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_LOOPBACK; return (0); } if (!IFP_IS_ETH(ifp)) return (ENOENT); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa->sa_data, LINUX_IFHWADDRLEN); return (0); } } return (ENOENT); } int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } /* * Based on the fact that: * 1. Native and Linux storage of struct sockaddr * and struct sockaddr_in6 are equal. * 2. On Linux sa_family is the first member of all struct sockaddr. */ int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len) { struct l_sockaddr *kosa; int error, bdom; *lsa = NULL; if (len < 2 || len > UCHAR_MAX) return (EINVAL); kosa = malloc(len, M_SONAME, M_WAITOK); bcopy(sa, kosa, len); bdom = bsd_to_linux_domain(sa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } kosa->sa_family = bdom; *lsa = kosa; return (0); out: free(kosa, M_SONAME); return (error); } int linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, socklen_t *len) { struct sockaddr *sa; struct l_sockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; bool oldv6size; #endif char *name; int salen, bdom, error, hdrlen, namelen; if (*len < 2 || *len > UCHAR_MAX) return (EINVAL); salen = *len; #ifdef INET6 oldv6size = false; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = true; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, *len))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { linux_msg(curthread, "obsolete pre-RFC2553 sockaddr_in6 rejected"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; *len = salen; return (0); out: free(kosa, M_SONAME); return (error); } void linux_dev_shm_create(void) { int error; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev_shm_cdev, &dev_shm_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0, "shm/.mountpoint"); if (error != 0) { printf("%s: failed to create device node, error %d\n", __func__, error); } } void linux_dev_shm_destroy(void) { destroy_dev(dev_shm_cdev); } int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value) { int bsd_mask, bsd_value, linux_mask, linux_value; int linux_ret; size_t i; bool applied; applied = false; linux_ret = 0; for (i = 0; i < mapcnt; ++i) { bsd_mask = bitmap[i].bsd_mask; bsd_value = bitmap[i].bsd_value; if (bsd_mask == 0) bsd_mask = bsd_value; linux_mask = bitmap[i].linux_mask; linux_value = bitmap[i].linux_value; if (linux_mask == 0) linux_mask = linux_value; /* * If a mask larger than just the value is set, we explicitly * want to make sure that only this bit we mapped within that * mask is set. */ if ((value & bsd_mask) == bsd_value) { linux_ret = (linux_ret & ~linux_mask) | linux_value; applied = true; } } if (!applied) return (no_value); return (linux_ret); } int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value) { int bsd_mask, bsd_value, linux_mask, linux_value; int bsd_ret; size_t i; bool applied; applied = false; bsd_ret = 0; for (i = 0; i < mapcnt; ++i) { bsd_mask = bitmap[i].bsd_mask; bsd_value = bitmap[i].bsd_value; if (bsd_mask == 0) bsd_mask = bsd_value; linux_mask = bitmap[i].linux_mask; linux_value = bitmap[i].linux_value; if (linux_mask == 0) linux_mask = linux_value; /* * If a mask larger than just the value is set, we explicitly * want to make sure that only this bit we mapped within that * mask is set. */ if ((value & linux_mask) == linux_value) { bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value; applied = true; } } if (!applied) return (no_value); return (bsd_ret); } void linux_to_bsd_poll_events(struct thread *td, int fd, short lev, short *bev) { struct proc *p = td->td_proc; struct filedesc *fdp; struct file *fp; int error; short bits = 0; if (lev & LINUX_POLLIN) bits |= POLLIN; if (lev & LINUX_POLLPRI) bits |= POLLPRI; if (lev & LINUX_POLLOUT) bits |= POLLOUT; if (lev & LINUX_POLLERR) bits |= POLLERR; if (lev & LINUX_POLLHUP) bits |= POLLHUP; if (lev & LINUX_POLLNVAL) bits |= POLLNVAL; if (lev & LINUX_POLLRDNORM) bits |= POLLRDNORM; if (lev & LINUX_POLLRDBAND) bits |= POLLRDBAND; if (lev & LINUX_POLLWRBAND) bits |= POLLWRBAND; if (lev & LINUX_POLLWRNORM) bits |= POLLWRNORM; if (lev & LINUX_POLLRDHUP) { /* * It seems that the Linux silencly ignores POLLRDHUP * on non-socket file descriptors unlike FreeBSD, where * events bits is more strictly checked (POLLSTANDARD). */ fdp = p->p_fd; error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error == 0) { /* * XXX. On FreeBSD POLLRDHUP applies only to * stream sockets. */ if (fp->f_type == DTYPE_SOCKET) bits |= POLLRDHUP; fdrop(fp, td); } } if (lev & LINUX_POLLMSG) LINUX_RATELIMIT_MSG_OPT1("unsupported POLLMSG, events(%d)", lev); if (lev & LINUX_POLLREMOVE) LINUX_RATELIMIT_MSG_OPT1("unsupported POLLREMOVE, events(%d)", lev); *bev = bits; } void bsd_to_linux_poll_events(short bev, short *lev) { short bits = 0; if (bev & POLLIN) bits |= LINUX_POLLIN; if (bev & POLLPRI) bits |= LINUX_POLLPRI; if (bev & (POLLOUT | POLLWRNORM)) /* * POLLWRNORM is equal to POLLOUT on FreeBSD, * but not on Linux */ bits |= LINUX_POLLOUT; if (bev & POLLERR) bits |= LINUX_POLLERR; if (bev & POLLHUP) bits |= LINUX_POLLHUP; if (bev & POLLNVAL) bits |= LINUX_POLLNVAL; if (bev & POLLRDNORM) bits |= LINUX_POLLRDNORM; if (bev & POLLRDBAND) bits |= LINUX_POLLRDBAND; if (bev & POLLWRBAND) bits |= LINUX_POLLWRBAND; if (bev & POLLRDHUP) bits |= LINUX_POLLRDHUP; *lev = bits; } diff --git a/sys/compat/linux/linux.h b/sys/compat/linux/linux.h index 9b75e6032445..15830b5b387e 100644 --- a/sys/compat/linux/linux.h +++ b/sys/compat/linux/linux.h @@ -1,267 +1,266 @@ /*- - * Copyright (c) 2015 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2015 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_MI_H_ #define _LINUX_MI_H_ #include /* * poll() */ #define LINUX_POLLIN 0x0001 #define LINUX_POLLPRI 0x0002 #define LINUX_POLLOUT 0x0004 #define LINUX_POLLERR 0x0008 #define LINUX_POLLHUP 0x0010 #define LINUX_POLLNVAL 0x0020 #define LINUX_POLLRDNORM 0x0040 #define LINUX_POLLRDBAND 0x0080 #define LINUX_POLLWRNORM 0x0100 #define LINUX_POLLWRBAND 0x0200 #define LINUX_POLLMSG 0x0400 #define LINUX_POLLREMOVE 0x1000 #define LINUX_POLLRDHUP 0x2000 #define LINUX_IFHWADDRLEN 6 #define LINUX_IFNAMSIZ 16 /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) #define IFP_IS_LOOP(ifp) (ifp->if_type == IFT_LOOP) struct l_sockaddr { unsigned short sa_family; char sa_data[14]; }; #define LINUX_ARPHRD_ETHER 1 #define LINUX_ARPHRD_LOOPBACK 772 /* * Supported address families */ #define LINUX_AF_UNSPEC 0 #define LINUX_AF_UNIX 1 #define LINUX_AF_INET 2 #define LINUX_AF_AX25 3 #define LINUX_AF_IPX 4 #define LINUX_AF_APPLETALK 5 #define LINUX_AF_INET6 10 #define LINUX_AF_NETLINK 16 #define LINUX_NETLINK_ROUTE 0 #define LINUX_NETLINK_SOCK_DIAG 4 #define LINUX_NETLINK_NFLOG 5 #define LINUX_NETLINK_SELINUX 7 #define LINUX_NETLINK_AUDIT 9 #define LINUX_NETLINK_FIB_LOOKUP 10 #define LINUX_NETLINK_NETFILTER 12 #define LINUX_NETLINK_KOBJECT_UEVENT 15 /* * net device flags */ #define LINUX_IFF_UP 0x0001 #define LINUX_IFF_BROADCAST 0x0002 #define LINUX_IFF_DEBUG 0x0004 #define LINUX_IFF_LOOPBACK 0x0008 #define LINUX_IFF_POINTOPOINT 0x0010 #define LINUX_IFF_NOTRAILERS 0x0020 #define LINUX_IFF_RUNNING 0x0040 #define LINUX_IFF_NOARP 0x0080 #define LINUX_IFF_PROMISC 0x0100 #define LINUX_IFF_ALLMULTI 0x0200 #define LINUX_IFF_MASTER 0x0400 #define LINUX_IFF_SLAVE 0x0800 #define LINUX_IFF_MULTICAST 0x1000 #define LINUX_IFF_PORTSEL 0x2000 #define LINUX_IFF_AUTOMEDIA 0x4000 #define LINUX_IFF_DYNAMIC 0x8000 /* sigaltstack */ #define LINUX_SS_ONSTACK 1 #define LINUX_SS_DISABLE 2 int linux_to_bsd_sigaltstack(int lsa); int bsd_to_linux_sigaltstack(int bsa); /* sigset */ typedef struct { uint64_t __mask; } l_sigset_t; /* primitives to manipulate sigset_t */ #define LINUX_SIGEMPTYSET(set) (set).__mask = 0 #define LINUX_SIGISMEMBER(set, sig) (1UL & ((set).__mask >> _SIG_IDX(sig))) #define LINUX_SIGADDSET(set, sig) (set).__mask |= 1UL << _SIG_IDX(sig) void linux_to_bsd_sigset(l_sigset_t *, sigset_t *); void bsd_to_linux_sigset(sigset_t *, l_sigset_t *); /* signaling */ #define LINUX_SIGHUP 1 #define LINUX_SIGINT 2 #define LINUX_SIGQUIT 3 #define LINUX_SIGILL 4 #define LINUX_SIGTRAP 5 #define LINUX_SIGABRT 6 #define LINUX_SIGIOT LINUX_SIGABRT #define LINUX_SIGBUS 7 #define LINUX_SIGFPE 8 #define LINUX_SIGKILL 9 #define LINUX_SIGUSR1 10 #define LINUX_SIGSEGV 11 #define LINUX_SIGUSR2 12 #define LINUX_SIGPIPE 13 #define LINUX_SIGALRM 14 #define LINUX_SIGTERM 15 #define LINUX_SIGSTKFLT 16 #define LINUX_SIGCHLD 17 #define LINUX_SIGCONT 18 #define LINUX_SIGSTOP 19 #define LINUX_SIGTSTP 20 #define LINUX_SIGTTIN 21 #define LINUX_SIGTTOU 22 #define LINUX_SIGURG 23 #define LINUX_SIGXCPU 24 #define LINUX_SIGXFSZ 25 #define LINUX_SIGVTALRM 26 #define LINUX_SIGPROF 27 #define LINUX_SIGWINCH 28 #define LINUX_SIGIO 29 #define LINUX_SIGPOLL LINUX_SIGIO #define LINUX_SIGPWR 30 #define LINUX_SIGSYS 31 #define LINUX_SIGTBLSZ 31 #define LINUX_SIGRTMIN 32 #define LINUX_SIGRTMAX 64 #define LINUX_SIG_VALID(sig) ((sig) <= LINUX_SIGRTMAX && (sig) > 0) int linux_to_bsd_signal(int sig); int bsd_to_linux_signal(int sig); extern LIST_HEAD(futex_list, futex) futex_list; extern struct mtx futex_mtx; void linux_dev_shm_create(void); void linux_dev_shm_destroy(void); /* * mask=0 is not sensible for this application, so it will be taken to mean * a mask equivalent to the value. Otherwise, (word & mask) == value maps to * (word & ~mask) | value in a bitfield for the platform we're converting to. */ struct bsd_to_linux_bitmap { int bsd_mask; int bsd_value; int linux_mask; int linux_value; }; int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value); int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value); /* * These functions are used for simplification of BSD <-> Linux bit conversions. * Given `value`, a bit field, these functions will walk the given bitmap table * and set the appropriate bits for the target platform. If any bits were * successfully converted, then the return value is the equivalent of value * represented with the bit values appropriate for the target platform. * Otherwise, the value supplied as `no_value` is returned. */ #define bsd_to_linux_bits(_val, _bmap, _noval) \ bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) #define linux_to_bsd_bits(_val, _bmap, _noval) \ linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) /* * Easy mapping helpers. BITMAP_EASY_LINUX represents a single bit to be * translated, and the FreeBSD and Linux values are supplied. BITMAP_1t1_LINUX * is the extreme version of this, where not only is it a single bit, but the * name of the macro used to represent the Linux version of a bit literally has * LINUX_ prepended to the normal name. */ #define BITMAP_EASY_LINUX(_name, _linux_name) \ { \ .bsd_value = (_name), \ .linux_value = (_linux_name), \ } #define BITMAP_1t1_LINUX(_name) BITMAP_EASY_LINUX(_name, LINUX_##_name) int bsd_to_linux_errno(int error); void linux_check_errtbl(void); #define STATX_BASIC_STATS 0x07ff #define STATX_BTIME 0x0800 #define STATX_ALL 0x0fff #define STATX_ATTR_COMPRESSED 0x0004 #define STATX_ATTR_IMMUTABLE 0x0010 #define STATX_ATTR_APPEND 0x0020 #define STATX_ATTR_NODUMP 0x0040 #define STATX_ATTR_ENCRYPTED 0x0800 #define STATX_ATTR_AUTOMOUNT 0x1000 struct l_statx_timestamp { int64_t tv_sec; int32_t tv_nsec; int32_t __spare0; }; struct l_statx { uint32_t stx_mask; uint32_t stx_blksize; uint64_t stx_attributes; uint32_t stx_nlink; uint32_t stx_uid; uint32_t stx_gid; uint16_t stx_mode; uint16_t __spare0[1]; uint64_t stx_ino; uint64_t stx_size; uint64_t stx_blocks; uint64_t stx_attributes_mask; struct l_statx_timestamp stx_atime; struct l_statx_timestamp stx_btime; struct l_statx_timestamp stx_ctime; struct l_statx_timestamp stx_mtime; uint32_t stx_rdev_major; uint32_t stx_rdev_minor; uint32_t stx_dev_major; uint32_t stx_dev_minor; uint64_t stx_mnt_id; uint64_t __spare2[13]; }; #endif /* _LINUX_MI_H_ */ diff --git a/sys/compat/linux/linux_common.h b/sys/compat/linux/linux_common.h index b0e3408e42df..c6837a6a3b52 100644 --- a/sys/compat/linux/linux_common.h +++ b/sys/compat/linux/linux_common.h @@ -1,49 +1,49 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2019 Dmitry Chagin + * Copyright (c) 2019 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_COMMON_H_ #define _LINUX_COMMON_H_ struct ifnet *ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname); void linux_ifflags(struct ifnet *ifp, short *flags); int linux_ifhwaddr(struct ifnet *ifp, struct l_sockaddr *lsa); int linux_to_bsd_domain(int domain); int bsd_to_linux_domain(int domain); int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len); int linux_to_bsd_sockaddr(const struct l_sockaddr *lsa, struct sockaddr **sap, socklen_t *len); void linux_to_bsd_poll_events(struct thread *td, int fd, short lev, short *bev); void bsd_to_linux_poll_events(short bev, short *lev); #endif /* _LINUX_COMMON_H_ */ diff --git a/sys/compat/linux/linux_dummy.c b/sys/compat/linux/linux_dummy.c index aa579d762c93..fbec69f4ac31 100644 --- a/sys/compat/linux/linux_dummy.c +++ b/sys/compat/linux/linux_dummy.c @@ -1,178 +1,177 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include /* * Including linux vs linux32 here is arbitrary -- the syscall args structures * (proto.h) are not dereferenced by the DUMMY stub implementations, and * suitable for use by both native and compat32 entrypoints. */ #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); UNIMPLEMENTED(afs_syscall); UNIMPLEMENTED(create_module); /* Added in Linux 1.0 removed in 2.6. */ UNIMPLEMENTED(epoll_ctl_old); UNIMPLEMENTED(epoll_wait_old); UNIMPLEMENTED(get_kernel_syms); /* Added in Linux 1.0 removed in 2.6. */ UNIMPLEMENTED(getpmsg); UNIMPLEMENTED(nfsservctl); /* Added in Linux 2.2 removed in 3.1. */ UNIMPLEMENTED(putpmsg); UNIMPLEMENTED(query_module); /* Added in Linux 2.2 removed in 2.6. */ UNIMPLEMENTED(security); UNIMPLEMENTED(vserver); DUMMY(setfsuid); DUMMY(setfsgid); DUMMY(vhangup); DUMMY(pivot_root); DUMMY(adjtimex); DUMMY(swapoff); DUMMY(init_module); DUMMY(delete_module); DUMMY(lookup_dcookie); DUMMY(remap_file_pages); DUMMY(mbind); DUMMY(get_mempolicy); DUMMY(set_mempolicy); DUMMY(kexec_load); /* Linux 2.6.11: */ DUMMY(add_key); DUMMY(request_key); DUMMY(keyctl); /* Linux 2.6.13: */ DUMMY(ioprio_set); DUMMY(ioprio_get); DUMMY(inotify_add_watch); DUMMY(inotify_rm_watch); /* Linux 2.6.16: */ DUMMY(migrate_pages); DUMMY(unshare); /* Linux 2.6.17: */ DUMMY(tee); DUMMY(vmsplice); /* Linux 2.6.18: */ DUMMY(move_pages); /* Linux 2.6.27: */ DUMMY(signalfd4); DUMMY(inotify_init1); /* Linux 2.6.31: */ DUMMY(perf_event_open); /* Linux 2.6.36: */ DUMMY(fanotify_init); DUMMY(fanotify_mark); /* Linux 2.6.39: */ DUMMY(clock_adjtime); /* Linux 3.0: */ DUMMY(setns); /* Linux 3.2: */ DUMMY(process_vm_readv); DUMMY(process_vm_writev); /* Linux 3.5: */ DUMMY(kcmp); /* Linux 3.8: */ DUMMY(finit_module); DUMMY(sched_setattr); DUMMY(sched_getattr); /* Linux 3.17: */ DUMMY(seccomp); /* Linux 3.18: */ DUMMY(bpf); /* Linux 3.19: */ DUMMY(execveat); /* Linux 4.2: */ DUMMY(userfaultfd); /* Linux 4.3: */ DUMMY(membarrier); /* Linux 4.4: */ DUMMY(mlock2); /* Linux 4.6: */ DUMMY(preadv2); DUMMY(pwritev2); /* Linux 4.8: */ DUMMY(pkey_mprotect); DUMMY(pkey_alloc); DUMMY(pkey_free); DUMMY(open_tree); DUMMY(move_mount); DUMMY(fsopen); DUMMY(fsconfig); DUMMY(fsmount); DUMMY(fspick); DUMMY(pidfd_open); DUMMY(clone3); DUMMY(close_range); DUMMY(openat2); DUMMY(pidfd_getfd); DUMMY(faccessat2); DUMMY(process_madvise); DUMMY(epoll_pwait2); DUMMY(mount_setattr); /* Linux 4.18: */ DUMMY(io_pgetevents); DUMMY(rseq); /* Linux 5.0: */ DUMMY(pidfd_send_signal); DUMMY(io_uring_setup); DUMMY(io_uring_enter); DUMMY(io_uring_register); #define DUMMY_XATTR(s) \ int \ linux_ ## s ## xattr( \ struct thread *td, struct linux_ ## s ## xattr_args *arg) \ { \ \ return (EOPNOTSUPP); \ } DUMMY_XATTR(set); DUMMY_XATTR(lset); DUMMY_XATTR(fset); DUMMY_XATTR(get); DUMMY_XATTR(lget); DUMMY_XATTR(fget); DUMMY_XATTR(list); DUMMY_XATTR(llist); DUMMY_XATTR(flist); DUMMY_XATTR(remove); DUMMY_XATTR(lremove); DUMMY_XATTR(fremove); diff --git a/sys/compat/linux/linux_emul.c b/sys/compat/linux/linux_emul.c index 499bebe8926a..fbfe9080cac7 100644 --- a/sys/compat/linux/linux_emul.c +++ b/sys/compat/linux/linux_emul.c @@ -1,383 +1,383 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * Copyright (c) 2006 Roman Divacky - * Copyright (c) 2013 Dmitry Chagin * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * This returns reference to the thread emuldata entry (if found) * * Hold PROC_LOCK when referencing emuldata from other threads. */ struct linux_emuldata * em_find(struct thread *td) { struct linux_emuldata *em; em = td->td_emuldata; return (em); } /* * This returns reference to the proc pemuldata entry (if found) * * Hold PROC_LOCK when referencing proc pemuldata from other threads. * Hold LINUX_PEM_LOCK wher referencing pemuldata members. */ struct linux_pemuldata * pem_find(struct proc *p) { struct linux_pemuldata *pem; pem = p->p_emuldata; return (pem); } /* * Linux apps generally expect the soft open file limit to be set * to 1024, often iterating over all the file descriptors up to that * limit instead of using closefrom(2). Give them what they want, * unless there already is a resource limit in place. */ static void linux_set_default_openfiles(struct thread *td, struct proc *p) { struct rlimit rlim; int error; if (linux_default_openfiles < 0) return; PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_NOFILE, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur != rlim.rlim_max || rlim.rlim_cur <= linux_default_openfiles) return; rlim.rlim_cur = linux_default_openfiles; error = kern_proc_setrlimit(td, p, RLIMIT_NOFILE, &rlim); KASSERT(error == 0, ("kern_proc_setrlimit failed")); } /* * The default stack size limit in Linux is 8MB. */ static void linux_set_default_stacksize(struct thread *td, struct proc *p) { struct rlimit rlim; int error; if (linux_default_stacksize < 0) return; PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur != rlim.rlim_max || rlim.rlim_cur <= linux_default_stacksize) return; rlim.rlim_cur = linux_default_stacksize; error = kern_proc_setrlimit(td, p, RLIMIT_STACK, &rlim); KASSERT(error == 0, ("kern_proc_setrlimit failed")); } void linux_proc_init(struct thread *td, struct thread *newtd, int flags) { struct linux_emuldata *em; struct linux_pemuldata *pem; struct proc *p; if (newtd != NULL) { p = newtd->td_proc; /* non-exec call */ em = malloc(sizeof(*em), M_TEMP, M_WAITOK | M_ZERO); if (flags & LINUX_CLONE_THREAD) { LINUX_CTR1(proc_init, "thread newtd(%d)", newtd->td_tid); em->em_tid = newtd->td_tid; } else { LINUX_CTR1(proc_init, "fork newtd(%d)", p->p_pid); em->em_tid = p->p_pid; pem = malloc(sizeof(*pem), M_LINUX, M_WAITOK | M_ZERO); sx_init(&pem->pem_sx, "lpemlk"); p->p_emuldata = pem; } newtd->td_emuldata = em; linux_set_default_openfiles(td, p); linux_set_default_stacksize(td, p); } else { p = td->td_proc; /* exec */ LINUX_CTR1(proc_init, "exec newtd(%d)", p->p_pid); /* lookup the old one */ em = em_find(td); KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n")); em->em_tid = p->p_pid; em->flags = 0; em->robust_futexes = NULL; em->child_clear_tid = NULL; em->child_set_tid = NULL; pem = pem_find(p); KASSERT(pem != NULL, ("proc_exit: proc emuldata not found.\n")); pem->persona = 0; } } void linux_on_exit(struct proc *p) { struct linux_pemuldata *pem; struct thread *td = curthread; MPASS(SV_CURPROC_ABI() == SV_ABI_LINUX); LINUX_CTR3(proc_exit, "thread(%d) proc(%d) p %p", td->td_tid, p->p_pid, p); pem = pem_find(p); if (pem == NULL) return; (p->p_sysent->sv_thread_detach)(td); p->p_emuldata = NULL; sx_destroy(&pem->pem_sx); free(pem, M_LINUX); } /* * If a Linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a Linux * binary is doing the exec, so we do not create an EXEC module for it. */ int linux_exec_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; char *rpath; int error = -1; /* * The interpreter for shell scripts run from a Linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain Linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an * alternate path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD); if (rpath != NULL) imgp->args->fname_buf = imgp->interpreter_name = rpath; } } return (error); } int linux_common_execve(struct thread *td, struct image_args *eargs) { struct linux_pemuldata *pem; struct vmspace *oldvmspace; struct linux_emuldata *em; struct proc *p; int error; p = td->td_proc; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = kern_execve(td, eargs, NULL, oldvmspace); post_execve(td, error, oldvmspace); if (error != EJUSTRETURN) return (error); /* * In a case of transition from Linux binary execing to * FreeBSD binary we destroy Linux emuldata thread & proc entries. */ if (SV_CURPROC_ABI() != SV_ABI_LINUX) { PROC_LOCK(p); em = em_find(td); KASSERT(em != NULL, ("proc_exec: thread emuldata not found.\n")); td->td_emuldata = NULL; pem = pem_find(p); KASSERT(pem != NULL, ("proc_exec: proc pemuldata not found.\n")); p->p_emuldata = NULL; PROC_UNLOCK(p); free(em, M_TEMP); free(pem, M_LINUX); } return (EJUSTRETURN); } void linux_on_exec(struct proc *p, struct image_params *imgp) { struct thread *td; struct thread *othertd; #if defined(__amd64__) struct linux_pemuldata *pem; #endif td = curthread; MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX); /* * When execing to Linux binary, we create Linux emuldata * thread entry. */ if (SV_PROC_ABI(p) == SV_ABI_LINUX) { /* * Process already was under Linuxolator * before exec. Update emuldata to reflect * single-threaded cleaned state after exec. */ linux_proc_init(td, NULL, 0); } else { /* * We are switching the process to Linux emulator. */ linux_proc_init(td, td, 0); /* * Create a transient td_emuldata for all suspended * threads, so that p->p_sysent->sv_thread_detach() == * linux_thread_detach() can find expected but unused * emuldata. */ FOREACH_THREAD_IN_PROC(td->td_proc, othertd) { if (othertd == td) continue; linux_proc_init(td, othertd, LINUX_CLONE_THREAD); } } #if defined(__amd64__) /* * An IA32 executable which has executable stack will have the * READ_IMPLIES_EXEC personality flag set automatically. */ if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && imgp->stack_prot & VM_PROT_EXECUTE) { pem = pem_find(p); pem->persona |= LINUX_READ_IMPLIES_EXEC; } #endif } void linux_thread_dtor(struct thread *td) { struct linux_emuldata *em; em = em_find(td); if (em == NULL) return; td->td_emuldata = NULL; LINUX_CTR1(thread_dtor, "thread(%d)", em->em_tid); free(em, M_TEMP); } void linux_schedtail(struct thread *td) { struct linux_emuldata *em; struct proc *p; int error = 0; int *child_set_tid; p = td->td_proc; em = em_find(td); KASSERT(em != NULL, ("linux_schedtail: thread emuldata not found.\n")); child_set_tid = em->child_set_tid; if (child_set_tid != NULL) { error = copyout(&em->em_tid, child_set_tid, sizeof(em->em_tid)); LINUX_CTR4(schedtail, "thread(%d) %p stored %d error %d", td->td_tid, child_set_tid, em->em_tid, error); } else LINUX_CTR1(schedtail, "thread(%d)", em->em_tid); } diff --git a/sys/compat/linux/linux_emul.h b/sys/compat/linux/linux_emul.h index d5f37c918b6c..ed5332b2c96c 100644 --- a/sys/compat/linux/linux_emul.h +++ b/sys/compat/linux/linux_emul.h @@ -1,88 +1,88 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 Roman Divacky - * Copyright (c) 2013 Dmitry Chagin * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_EMUL_H_ #define _LINUX_EMUL_H_ struct image_params; struct note_info_list; /* Linux core notes are labeled "CORE" */ #define LINUX_ABI_VENDOR "CORE" /* * modeled after similar structure in NetBSD * this will be extended as we need more functionality */ struct linux_emuldata { int *child_set_tid; /* in clone(): Child's TID to set on clone */ int *child_clear_tid;/* in clone(): Child's TID to clear on exit */ int flags; /* thread emuldata flags */ int em_tid; /* thread id */ struct linux_robust_list_head *robust_futexes; }; struct linux_emuldata *em_find(struct thread *); int linux_exec_imgact_try(struct image_params *); void linux_proc_init(struct thread *, struct thread *, int); void linux_on_exit(struct proc *); void linux_schedtail(struct thread *); void linux_on_exec(struct proc *, struct image_params *); void linux_thread_dtor(struct thread *); void linux_thread_detach(struct thread *); int linux_common_execve(struct thread *, struct image_args *); void linux32_prepare_notes(struct thread *, struct note_info_list *, size_t *); void linux64_prepare_notes(struct thread *, struct note_info_list *, size_t *); /* process emuldata flags */ #define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated futex REQUEUE op*/ #define LINUX_XUNSUP_EPOLL 0x00000002 /* unsupported epoll events */ #define LINUX_XUNSUP_FUTEXPIOP 0x00000004 /* uses unsupported pi futex */ struct linux_pemuldata { uint32_t flags; /* process emuldata flags */ struct sx pem_sx; /* lock for this struct */ uint32_t persona; /* process execution domain */ uint32_t ptrace_flags; /* used by ptrace(2) */ }; #define LINUX_PEM_XLOCK(p) sx_xlock(&(p)->pem_sx) #define LINUX_PEM_XUNLOCK(p) sx_xunlock(&(p)->pem_sx) #define LINUX_PEM_SLOCK(p) sx_slock(&(p)->pem_sx) #define LINUX_PEM_SUNLOCK(p) sx_sunlock(&(p)->pem_sx) struct linux_pemuldata *pem_find(struct proc *); #endif /* !_LINUX_EMUL_H_ */ diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c index 991af1b539e4..9860acd8900d 100644 --- a/sys/compat/linux/linux_event.c +++ b/sys/compat/linux/linux_event.c @@ -1,981 +1,981 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Roman Divacky - * Copyright (c) 2014 Dmitry Chagin * All rights reserved. + * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include typedef uint64_t epoll_udata_t; struct epoll_event { uint32_t events; epoll_udata_t data; } #if defined(__amd64__) __attribute__((packed)) #endif ; #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) static int epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, struct kevent *kevent, int *nkevents); static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags); static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd); static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd); struct epoll_copyin_args { struct kevent *changelist; }; struct epoll_copyout_args { struct epoll_event *leventlist; struct proc *p; uint32_t count; int error; }; /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; static fo_ioctl_t timerfd_ioctl; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; static fo_close_t timerfd_close; static fo_fill_kinfo_t timerfd_fill_kinfo; static struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = timerfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_timerfddetach(struct knote *kn); static int filt_timerfdread(struct knote *kn, long hint); static struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread }; struct timerfd { clockid_t tfd_clockid; struct itimerspec tfd_time; struct callout tfd_callout; timerfd_t tfd_count; bool tfd_canceled; struct selinfo tfd_sel; struct mtx tfd_lock; }; static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); static int epoll_create_common(struct thread *td, int flags) { return (kern_kqueue(td, flags, NULL)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) { /* * args->size is unused. Linux just tests it * and then forgets it as well. */ if (args->size <= 0) return (EINVAL); return (epoll_create_common(td, 0)); } #endif int linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) { int flags; if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; return (epoll_create_common(td, flags)); } /* Structure converting function from epoll to kevent. */ static int epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, struct kevent *kevent, int *nkevents) { uint32_t levents = l_event->events; struct linux_pemuldata *pem; struct proc *p; unsigned short kev_flags = EV_ADD | EV_ENABLE; /* flags related to how event is registered */ if ((levents & LINUX_EPOLLONESHOT) != 0) kev_flags |= EV_DISPATCH; if ((levents & LINUX_EPOLLET) != 0) kev_flags |= EV_CLEAR; if ((levents & LINUX_EPOLLERR) != 0) kev_flags |= EV_ERROR; if ((levents & LINUX_EPOLLRDHUP) != 0) kev_flags |= EV_EOF; /* flags related to what event is registered */ if ((levents & LINUX_EPOLL_EVRD) != 0) { EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0); kevent->ext[0] = l_event->data; ++kevent; ++(*nkevents); } if ((levents & LINUX_EPOLL_EVWR) != 0) { EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0); kevent->ext[0] = l_event->data; ++kevent; ++(*nkevents); } /* zero event mask is legal */ if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) { EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0); ++(*nkevents); } if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_XLOCK(pem); if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { pem->flags |= LINUX_XUNSUP_EPOLL; LINUX_PEM_XUNLOCK(pem); linux_msg(td, "epoll_ctl unsupported flags: 0x%x", levents); } else LINUX_PEM_XUNLOCK(pem); return (EINVAL); } return (0); } /* * Structure converting function from kevent to epoll. In a case * this is called on error in registration we store the error in * event->data and pick it up later in linux_epoll_ctl(). */ static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) { l_event->data = kevent->ext[0]; if ((kevent->flags & EV_ERROR) != 0) { l_event->events = LINUX_EPOLLERR; return; } /* XXX EPOLLPRI, EPOLLHUP */ switch (kevent->filter) { case EVFILT_READ: l_event->events = LINUX_EPOLLIN; if ((kevent->flags & EV_EOF) != 0) l_event->events |= LINUX_EPOLLRDHUP; break; case EVFILT_WRITE: l_event->events = LINUX_EPOLLOUT; break; } } /* * Copyout callback used by kevent. This converts kevent * events to epoll events and copies them back to the * userspace. This is also called on error on registering * of the filter. */ static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count) { struct epoll_copyout_args *args; struct epoll_event *eep; int error, i; args = (struct epoll_copyout_args*) arg; eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); for (i = 0; i < count; i++) kevent_to_epoll(&kevp[i], &eep[i]); error = copyout(eep, args->leventlist, count * sizeof(*eep)); if (error == 0) { args->leventlist += count; args->count += count; } else if (args->error == 0) args->error = error; free(eep, M_EPOLL); return (error); } /* * Copyin callback used by kevent. This copies already * converted filters from kernel memory to the kevent * internal kernel memory. Hence the memcpy instead of * copyin. */ static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count) { struct epoll_copyin_args *args; args = (struct epoll_copyin_args*) arg; memcpy(kevp, args->changelist, count * sizeof(*kevp)); args->changelist += count; return (0); } /* * Load epoll filter, convert it to kevent filter * and load it into kevent subsystem. */ int linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) { struct file *epfp, *fp; struct epoll_copyin_args ciargs; struct kevent kev[2]; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; struct epoll_event le; cap_rights_t rights; int nchanges = 0; int error; if (args->op != LINUX_EPOLL_CTL_DEL) { error = copyin(args->event, &le, sizeof(le)); if (error != 0) return (error); } error = fget(td, args->epfd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } /* Protect user data vector from incorrectly supplied fd. */ error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp); if (error != 0) goto leave1; /* Linux disallows spying on himself */ if (epfp == fp) { error = EINVAL; goto leave0; } ciargs.changelist = kev; if (args->op != LINUX_EPOLL_CTL_DEL) { error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); if (error != 0) goto leave0; } switch (args->op) { case LINUX_EPOLL_CTL_MOD: error = epoll_delete_all_events(td, epfp, args->fd); if (error != 0) goto leave0; break; case LINUX_EPOLL_CTL_ADD: if (epoll_fd_registered(td, epfp, args->fd)) { error = EEXIST; goto leave0; } break; case LINUX_EPOLL_CTL_DEL: /* CTL_DEL means unregister this fd with this epoll */ error = epoll_delete_all_events(td, epfp, args->fd); goto leave0; default: error = EINVAL; goto leave0; } error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); leave0: fdrop(fp, td); leave1: fdrop(epfp, td); return (error); } /* * Wait for a filter to be triggered on the epoll file descriptor. */ static int linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, int maxevents, int timeout, sigset_t *uset) { struct epoll_copyout_args coargs; struct kevent_copyops k_ops = { &coargs, epoll_kev_copyout, NULL}; struct timespec ts, *tsp; cap_rights_t rights; struct file *epfp; sigset_t omask; int error; if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) return (EINVAL); error = fget(td, epfd, cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave; } if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &omask, 0); if (error != 0) goto leave; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } coargs.leventlist = events; coargs.p = td->td_proc; coargs.count = 0; coargs.error = 0; /* * Linux epoll_wait(2) man page states that timeout of -1 causes caller * to block indefinitely. Real implementation does it if any negative * timeout value is passed. */ if (timeout >= 0) { /* Convert from milliseconds to timespec. */ ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; tsp = &ts; } else { tsp = NULL; } error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); if (error == 0 && coargs.error != 0) error = coargs.error; /* * kern_kevent might return ENOMEM which is not expected from epoll_wait. * Maybe we should translate that but I don't think it matters at all. */ if (error == 0) td->td_retval[0] = coargs.count; if (uset != NULL) error = kern_sigprocmask(td, SIG_SETMASK, &omask, NULL, 0); leave: fdrop(epfp, td); return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) { return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, NULL)); } #endif int linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) { sigset_t mask, *pmask; l_sigset_t lmask; int error; if (args->mask != NULL) { if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); if (error != 0) return (error); linux_to_bsd_sigset(&lmask, &mask); pmask = &mask; } else pmask = NULL; return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, pmask)); } static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags) { struct epoll_copyin_args ciargs; struct kevent kev; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; ciargs.changelist = &kev; EV_SET(&kev, fd, filter, flags, 0, 0, 0); return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); } static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd) { /* * Set empty filter flags to avoid accidental modification of already * registered events. In the case of event re-registration: * 1. If event does not exists kevent() does nothing and returns ENOENT * 2. If event does exists, it's enabled/disabled state is preserved * but fflags, data and udata fields are overwritten. So we can not * set socket lowats and store user's context pointer in udata. */ if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) return (1); return (0); } static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) { int error1, error2; error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); /* return 0 if at least one result positive */ return (error1 == 0 ? 0 : error2); } #ifdef LINUX_LEGACY_SYSCALLS int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) { struct specialfd_eventfd ae; bzero(&ae, sizeof(ae)); ae.initval = args->initval; return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); } #endif int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { struct specialfd_eventfd ae; int flags; if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK | LINUX_EFD_SEMAPHORE)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= EFD_CLOEXEC; if ((args->flags & LINUX_O_NONBLOCK) != 0) flags |= EFD_NONBLOCK; if ((args->flags & LINUX_EFD_SEMAPHORE) != 0) flags |= EFD_SEMAPHORE; bzero(&ae, sizeof(ae)); ae.flags = flags; ae.initval = args->initval; return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); } int linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) { struct filedesc *fdp; struct timerfd *tfd; struct file *fp; clockid_t clockid; int fflags, fd, error; if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) return (EINVAL); error = linux_to_native_clockid(&clockid, args->clockid); if (error != 0) return (error); if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return (EINVAL); fflags = 0; if ((args->flags & LINUX_TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); tfd->tfd_clockid = clockid; mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); fflags = FREAD; if ((args->flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } static int timerfd_close(struct file *fp, struct thread *td) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); timespecclear(&tfd->tfd_time.it_value); timespecclear(&tfd->tfd_time.it_interval); callout_drain(&tfd->tfd_callout); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&tfd->tfd_lock); free(tfd, M_EPOLL); return (0); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd; timerfd_t count; int error; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); error = 0; mtx_lock(&tfd->tfd_lock); retry: if (tfd->tfd_canceled) { tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (ECANCELED); } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); if (error == 0) goto retry; } if (error == 0) { count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); } else mtx_unlock(&tfd->tfd_lock); return (error); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd; int revents = 0; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (POLLERR); mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (kn->kn_filter == EVFILT_READ) kn->kn_fop = &timerfd_rfiltops; else return (EINVAL); kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; return (tfd->tfd_count > 0); } static int timerfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD) return (EINVAL); switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } static void linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) { if (tfd->tfd_clockid == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } static void linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) { struct timespec cts; linux_timerfd_clocktime(tfd, &cts); *ots = tfd->tfd_time; if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { timespecsub(&ots->it_value, &cts, &ots->it_value); if (ots->it_value.tv_sec < 0 || (ots->it_value.tv_sec == 0 && ots->it_value.tv_nsec == 0)) { ots->it_value.tv_sec = 0; ots->it_value.tv_nsec = 1; } } } int linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) { struct l_itimerspec lots; struct itimerspec ots; struct timerfd *tfd; struct file *fp; int error; error = fget(td, args->fd, &cap_read_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); linux_timerfd_curval(tfd, &ots); mtx_unlock(&tfd->tfd_lock); error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); out: fdrop(fp, td); return (error); } int linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) { struct l_itimerspec lots; struct itimerspec nts, ots; struct timespec cts, ts; struct timerfd *tfd; struct timeval tv; struct file *fp; int error; if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) return (EINVAL); error = copyin(args->new_value, &lots, sizeof(lots)); if (error != 0) return (error); error = linux_to_native_itimerspec(&nts, &lots); if (error != 0) return (error); error = fget(td, args->fd, &cap_write_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); if (!timespecisset(&nts.it_value)) timespecclear(&nts.it_interval); if (args->old_value != NULL) linux_timerfd_curval(tfd, &ots); tfd->tfd_time = nts; tfd->tfd_count = 0; if (timespecisset(&nts.it_value)) { linux_timerfd_clocktime(tfd, &cts); ts = nts.it_value; if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { timespecadd(&tfd->tfd_time.it_value, &cts, &tfd->tfd_time.it_value); } else { timespecsub(&ts, &cts, &ts); } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); tfd->tfd_canceled = false; } else { tfd->tfd_canceled = true; callout_stop(&tfd->tfd_callout); } mtx_unlock(&tfd->tfd_lock); if (args->old_value != NULL) { error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); } out: fdrop(fp, td); return (error); } static void linux_timerfd_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct timerfd *tfd; tfd = (struct timerfd *)arg; linux_timerfd_clocktime(tfd, &cts); if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { if (timespecisset(&tfd->tfd_time.it_interval)) timespecadd(&tfd->tfd_time.it_value, &tfd->tfd_time.it_interval, &tfd->tfd_time.it_value); else /* single shot timer */ timespecclear(&tfd->tfd_time.it_value); if (timespecisset(&tfd->tfd_time.it_value)) { timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } tfd->tfd_count++; KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); selwakeup(&tfd->tfd_sel); wakeup(&tfd->tfd_count); } else if (timespecisset(&tfd->tfd_time.it_value)) { timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } } diff --git a/sys/compat/linux/linux_event.h b/sys/compat/linux/linux_event.h index d6a0c2d42d0c..27b25b4e28e3 100644 --- a/sys/compat/linux/linux_event.h +++ b/sys/compat/linux/linux_event.h @@ -1,71 +1,71 @@ /*- * Copyright (c) 2007 Roman Divacky - * Copyright (c) 2014 Dmitry Chagin * All rights reserved. + * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_EVENT_H_ #define _LINUX_EVENT_H_ #define LINUX_EPOLLIN 0x001 #define LINUX_EPOLLPRI 0x002 #define LINUX_EPOLLOUT 0x004 #define LINUX_EPOLLRDNORM 0x040 #define LINUX_EPOLLRDBAND 0x080 #define LINUX_EPOLLWRNORM 0x100 #define LINUX_EPOLLWRBAND 0x200 #define LINUX_EPOLLMSG 0x400 #define LINUX_EPOLLERR 0x008 #define LINUX_EPOLLHUP 0x010 #define LINUX_EPOLLRDHUP 0x2000 #define LINUX_EPOLLWAKEUP 1u<<29 #define LINUX_EPOLLONESHOT 1u<<30 #define LINUX_EPOLLET 1u<<31 #define LINUX_EPOLL_EVRD (LINUX_EPOLLIN|LINUX_EPOLLRDNORM) #define LINUX_EPOLL_EVWR (LINUX_EPOLLOUT|LINUX_EPOLLWRNORM) #define LINUX_EPOLL_EVSUP (LINUX_EPOLLET|LINUX_EPOLLONESHOT \ |LINUX_EPOLLHUP|LINUX_EPOLLERR|LINUX_EPOLLPRI \ |LINUX_EPOLL_EVRD|LINUX_EPOLL_EVWR|LINUX_EPOLLRDHUP) #define LINUX_EPOLL_CTL_ADD 1 #define LINUX_EPOLL_CTL_DEL 2 #define LINUX_EPOLL_CTL_MOD 3 #define LINUX_EFD_SEMAPHORE (1 << 0) #define LINUX_TFD_TIMER_ABSTIME (1 << 0) #define LINUX_TFD_TIMER_CANCEL_ON_SET (1 << 1) #define LINUX_TFD_CLOEXEC LINUX_O_CLOEXEC #define LINUX_TFD_NONBLOCK LINUX_O_NONBLOCK #define LINUX_TFD_SHARED_FCNTL_FLAGS (LINUX_TFD_CLOEXEC \ |LINUX_TFD_NONBLOCK) #define LINUX_TFD_CREATE_FLAGS LINUX_TFD_SHARED_FCNTL_FLAGS #define LINUX_TFD_SETTIME_FLAGS (LINUX_TFD_TIMER_ABSTIME \ |LINUX_TFD_TIMER_CANCEL_ON_SET) #endif /* !_LINUX_EVENT_H_ */ diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 6d4c31a8a6df..d1fd7725650d 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -1,1217 +1,1217 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * - * Copyright (c) 2009-2016 Dmitry Chagin * Copyright (c) 2005 Emmanuel Dreyfus * All rights reserved. + * Copyright (c) 2009-2016 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * Futex part for the special DTrace module "locks". */ LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); /** * Per futex probes. */ LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", "struct waiting_proc *", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_clockswitch); LIN_SDT_PROBE_DEFINE1(futex, linux_futex, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, linux_futex, debug_wait_value_neq, "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); LIN_SDT_PROBE_DEFINE2(futex, linux_futex, debug_cmp_requeue_value_neq, "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int"); struct futex; struct waiting_proc { uint32_t wp_flags; struct futex *wp_futex; TAILQ_ENTRY(waiting_proc) wp_list; }; struct futex { struct mtx f_lck; uint32_t *f_uaddr; /* user-supplied value, for debug */ struct umtx_key f_key; uint32_t f_refcount; uint32_t f_bitset; LIST_ENTRY(futex) f_list; TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; }; #define FUTEX_LOCK(f) mtx_lock(&(f)->f_lck) #define FUTEX_LOCKED(f) mtx_owned(&(f)->f_lck) #define FUTEX_UNLOCK(f) mtx_unlock(&(f)->f_lck) #define FUTEX_INIT(f) do { \ mtx_init(&(f)->f_lck, "ftlk", NULL, \ MTX_DUPOK); \ LIN_SDT_PROBE1(futex, futex, create, \ &(f)->f_lck); \ } while (0) #define FUTEX_DESTROY(f) do { \ LIN_SDT_PROBE1(futex, futex, destroy, \ &(f)->f_lck); \ mtx_destroy(&(f)->f_lck); \ } while (0) #define FUTEX_ASSERT_LOCKED(f) mtx_assert(&(f)->f_lck, MA_OWNED) #define FUTEX_ASSERT_UNLOCKED(f) mtx_assert(&(f)->f_lck, MA_NOTOWNED) #define FUTEXES_LOCK do { \ mtx_lock(&futex_mtx); \ LIN_SDT_PROBE1(locks, futex_mtx, \ locked, &futex_mtx); \ } while (0) #define FUTEXES_UNLOCK do { \ LIN_SDT_PROBE1(locks, futex_mtx, \ unlock, &futex_mtx); \ mtx_unlock(&futex_mtx); \ } while (0) /* flags for futex_get() */ #define FUTEX_CREATE_WP 0x1 /* create waiting_proc */ #define FUTEX_DONTCREATE 0x2 /* don't create futex if not exists */ #define FUTEX_DONTEXISTS 0x4 /* return EINVAL if futex exists */ #define FUTEX_SHARED 0x8 /* shared futex */ #define FUTEX_DONTLOCK 0x10 /* don't lock futex */ /* wp_flags */ #define FUTEX_WP_REQUEUED 0x1 /* wp requeued - wp moved from wp_list * of futex where thread sleep to wp_list * of another futex. */ #define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex * wp_list to prevent double wakeup. */ static void futex_put(struct futex *, struct waiting_proc *); static int futex_get0(uint32_t *, struct futex **f, uint32_t); static int futex_get(uint32_t *, struct waiting_proc **, struct futex **, uint32_t); static int futex_sleep(struct futex *, struct waiting_proc *, struct timespec *); static int futex_wake(struct futex *, int, uint32_t); static int futex_requeue(struct futex *, int, struct futex *, int); static int futex_wait(struct futex *, struct waiting_proc *, struct timespec *, uint32_t); static void futex_lock(struct futex *); static void futex_unlock(struct futex *); static int futex_atomic_op(struct thread *, int, uint32_t *); static int handle_futex_death(struct linux_emuldata *, uint32_t *, unsigned int); static int fetch_robust_entry(struct linux_robust_list **, struct linux_robust_list **, unsigned int *); struct linux_futex_args { uint32_t *uaddr; int32_t op; uint32_t val; struct timespec *ts; uint32_t *uaddr2; uint32_t val3; struct timespec kts; }; static int linux_futex(struct thread *, struct linux_futex_args *); static void futex_put(struct futex *f, struct waiting_proc *wp) { if (wp != NULL) { if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); free(wp, M_FUTEX_WP); } FUTEXES_LOCK; if (--f->f_refcount == 0) { LIST_REMOVE(f, f_list); FUTEXES_UNLOCK; if (FUTEX_LOCKED(f)) futex_unlock(f); LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d " "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); umtx_key_release(&f->f_key); FUTEX_DESTROY(f); free(f, M_FUTEX); return; } LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); if (FUTEX_LOCKED(f)) futex_unlock(f); FUTEXES_UNLOCK; } static int futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags) { struct futex *f, *tmpf; struct umtx_key key; int error; *newf = tmpf = NULL; error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE, &key); if (error) { LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error); return (error); } retry: FUTEXES_LOCK; LIST_FOREACH(f, &futex_list, f_list) { if (umtx_key_match(&f->f_key, &key)) { if (tmpf != NULL) { if (FUTEX_LOCKED(tmpf)) futex_unlock(tmpf); FUTEX_DESTROY(tmpf); free(tmpf, M_FUTEX); } if (flags & FUTEX_DONTEXISTS) { FUTEXES_UNLOCK; umtx_key_release(&key); return (EINVAL); } /* * Increment refcount of the found futex to * prevent it from deallocation before FUTEX_LOCK() */ ++f->f_refcount; FUTEXES_UNLOCK; umtx_key_release(&key); if ((flags & FUTEX_DONTLOCK) == 0) futex_lock(f); *newf = f; LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d", uaddr, f->f_refcount, f->f_key.shared); return (0); } } if (flags & FUTEX_DONTCREATE) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, null, uaddr); LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr); return (0); } if (tmpf == NULL) { FUTEXES_UNLOCK; tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO); tmpf->f_uaddr = uaddr; tmpf->f_key = key; tmpf->f_refcount = 1; tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY; FUTEX_INIT(tmpf); TAILQ_INIT(&tmpf->f_waiting_proc); /* * Lock the new futex before an insert into the futex_list * to prevent futex usage by other. */ if ((flags & FUTEX_DONTLOCK) == 0) futex_lock(tmpf); goto retry; } LIST_INSERT_HEAD(&futex_list, tmpf, f_list); FUTEXES_UNLOCK; LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount, tmpf->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new", uaddr, tmpf->f_refcount, tmpf->f_key.shared); *newf = tmpf; return (0); } static int futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f, uint32_t flags) { int error; if (flags & FUTEX_CREATE_WP) { *wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK); (*wp)->wp_flags = 0; } error = futex_get0(uaddr, f, flags); if (error) { LIN_SDT_PROBE0(futex, futex_get, error); if (flags & FUTEX_CREATE_WP) free(*wp, M_FUTEX_WP); return (error); } if (flags & FUTEX_CREATE_WP) { TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list); (*wp)->wp_futex = *f; } return (error); } static inline void futex_lock(struct futex *f) { LINUX_CTR3(sys_futex, "futex_lock uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEX_ASSERT_UNLOCKED(f); FUTEX_LOCK(f); } static inline void futex_unlock(struct futex *f) { LINUX_CTR3(sys_futex, "futex_unlock uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEX_ASSERT_LOCKED(f); FUTEX_UNLOCK(f); } static int futex_sleep(struct futex *f, struct waiting_proc *wp, struct timespec *ts) { sbintime_t sbt, prec, tmp; time_t over; int error; FUTEX_ASSERT_LOCKED(f); if (ts != NULL) { if (ts->tv_sec > INT32_MAX / 2) { over = ts->tv_sec - INT32_MAX / 2; ts->tv_sec -= over; } tmp = tstosbt(*ts); if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; prec = tmp; prec >>= tc_precexp; } else { sbt = 0; prec = 0; } LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %ld ref %d", f->f_uaddr, wp, sbt, f->f_refcount); error = msleep_sbt(wp, &f->f_lck, PCATCH, "futex", sbt, prec, C_ABSOLUTE); if (wp->wp_flags & FUTEX_WP_REQUEUED) { KASSERT(f != wp->wp_futex, ("futex != wp_futex")); if (error) { LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); } LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp" " %p requeued uaddr %p ref %d", error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); futex_put(f, NULL); f = wp->wp_futex; futex_lock(f); } else { if (error) { LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error, f->f_uaddr, wp); } LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p", error, f->f_uaddr, wp); } futex_put(f, wp); return (error); } static int futex_wake(struct futex *f, int n, uint32_t bitset) { struct waiting_proc *wp, *wpt; int count = 0; if (bitset == 0) return (EINVAL); FUTEX_ASSERT_LOCKED(f); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp, f->f_refcount); LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d", f->f_uaddr, wp, f->f_refcount); /* * Unless we find a matching bit in * the bitset, continue searching. */ if (!(wp->wp_futex->f_bitset & bitset)) continue; wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp); wakeup_one(wp); if (++count == n) break; } return (count); } static int futex_requeue(struct futex *f, int nrwake, struct futex *f2, int nrrequeue) { struct waiting_proc *wp, *wpt; int count = 0; FUTEX_ASSERT_LOCKED(f); FUTEX_ASSERT_LOCKED(f2); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { if (++count <= nrwake) { LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p", f->f_uaddr, wp); wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp); wakeup_one(wp); } else { LIN_SDT_PROBE3(futex, futex_requeue, requeue, f->f_uaddr, wp, f2->f_uaddr); LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p", f->f_uaddr, wp, f2->f_uaddr); wp->wp_flags |= FUTEX_WP_REQUEUED; /* Move wp to wp_list of f2 futex */ TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list); /* * Thread which sleeps on wp after waking should * acquire f2 lock, so increment refcount of f2 to * prevent it from premature deallocation. */ wp->wp_futex = f2; FUTEXES_LOCK; ++f2->f_refcount; FUTEXES_UNLOCK; if (count - nrwake >= nrrequeue) break; } } return (count); } static int futex_wait(struct futex *f, struct waiting_proc *wp, struct timespec *ts, uint32_t bitset) { int error; if (bitset == 0) { futex_put(f, wp); return (EINVAL); } f->f_bitset = bitset; error = futex_sleep(f, wp, ts); if (error) LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); if (error == EWOULDBLOCK) error = ETIMEDOUT; return (error); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); /* XXX: Linux verifies access here and returns EFAULT */ LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) return (ret); switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } return (ret); } static int linux_futex(struct thread *td, struct linux_futex_args *args) { int clockrt, nrwake, nrrequeue, op_ret, ret; struct linux_pemuldata *pem; struct waiting_proc *wp; struct futex *f, *f2; struct timespec kts; int error, save; uint32_t flags, val; if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else flags = FUTEX_SHARED; /* * Currently support for switching between CLOCK_MONOTONIC and * CLOCK_REALTIME is not present. However Linux forbids the use of * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and * FUTEX_WAIT_REQUEUE_PI. */ clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { LIN_SDT_PROBE0(futex, linux_futex, unimplemented_clockswitch); return (ENOSYS); } error = 0; f = f2 = NULL; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); if (args->ts != NULL) { if (clockrt) { nanotime(&kts); timespecsub(args->ts, &kts, args->ts); } else if (args->op == LINUX_FUTEX_WAIT_BITSET) { nanouptime(&kts); timespecsub(args->ts, &kts, args->ts); } } retry0: error = futex_get(args->uaddr, &wp, &f, flags | FUTEX_CREATE_WP); if (error) return (error); error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f, wp); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry0; LIN_SDT_PROBE1(futex, linux_futex, copyin_error, error); LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); return (error); } if (val != args->val) { LIN_SDT_PROBE4(futex, linux_futex, debug_wait_value_neq, args->uaddr, args->val, val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x != uval 0x%x", args->uaddr, args->val, val); futex_put(f, wp); return (EWOULDBLOCK); } error = futex_wait(f, wp, args->ts, args->val3); break; case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTCREATE); if (error) return (error); if (f == NULL) { td->td_retval[0] = 0; return (error); } td->td_retval[0] = futex_wake(f, args->val, args->val3); futex_put(f, NULL); break; case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->ts); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->ts); /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. */ if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE0(futex, linux_futex, invalid_cmp_requeue_use); return (EINVAL); } nrrequeue = (int)(unsigned long)args->ts; nrwake = args->val; /* * Sanity check to prevent signed integer overflow, * see Linux CVE-2018-6927 */ if (nrwake < 0 || nrrequeue < 0) return (EINVAL); retry1: error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) return (error); /* * To avoid deadlocks return EINVAL if second futex * exists at this time. * * Glibc fall back to FUTEX_WAKE in case of any error * returned by FUTEX_CMP_REQUEUE. */ error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTEXISTS | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); return (error); } futex_lock(f); futex_lock(f2); error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry1; LIN_SDT_PROBE1(futex, linux_futex, copyin_error, error); LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", error); return (error); } if (val != args->val3) { LIN_SDT_PROBE2(futex, linux_futex, debug_cmp_requeue_value_neq, args->val, val); LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", args->val, val); futex_put(f2, NULL); futex_put(f, NULL); return (EAGAIN); } td->td_retval[0] = futex_requeue(f, nrwake, f2, nrrequeue); futex_put(f2, NULL); futex_put(f, NULL); break; case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->ts); if (args->uaddr == args->uaddr2) return (EINVAL); retry2: error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) return (error); error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); return (error); } futex_lock(f); futex_lock(f2); /* * This function returns positive number as results and * negative as errors */ save = vm_fault_disable_pagefaults(); op_ret = futex_atomic_op(td, args->val3, args->uaddr2); vm_fault_enable_pagefaults(save); LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", args->uaddr, op_ret); if (op_ret < 0) { if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr2, &val, sizeof(val)); if (error == 0) goto retry2; return (error); } ret = futex_wake(f, args->val, args->val3); if (op_ret > 0) { op_ret = 0; nrwake = (int)(unsigned long)args->ts; if (f2 != NULL) op_ret += futex_wake(f2, nrwake, args->val3); else op_ret += futex_wake(f, nrwake, args->val3); ret += op_ret; } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); td->td_retval[0] = ret; break; case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_LOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_lock_pi); } return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_UNLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_unlock_pi); } return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_TRYLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_trylock_pi); } return (ENOSYS); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "unsupported FUTEX_REQUEUE"); pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_futex, deprecated_requeue); } return (EINVAL); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_wait_requeue_pi); } return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_cmp_requeue_pi); } return (ENOSYS); default: linux_msg(td, "unsupported futex op %d", args->op); LIN_SDT_PROBE1(futex, linux_futex, unknown_operation, args->op); return (ENOSYS); } return (error); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, }; struct l_timespec lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_sys_futex_time64(struct thread *td, struct linux_sys_futex_time64_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, }; struct l_timespec64 lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec64(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #endif int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) return (ESRCH); if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { PROC_UNLOCK(td2->td_proc); return (EPERM); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); return (EFAULT); } error = copyout(&head, args->head, sizeof(head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; struct futex *f; int error; retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); return (EFAULT); } if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) return (EFAULT); if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_get(uaddr, NULL, &f, FUTEX_DONTCREATE | FUTEX_SHARED); if (error) return (error); if (f != NULL) { futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); futex_put(f, NULL); } } } return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; head = em->robust_futexes; if (head == NULL) return; if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) return; error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) return; while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { return; } if (rc) return; entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); } diff --git a/sys/compat/linux/linux_vdso.c b/sys/compat/linux/linux_vdso.c index af90941d1981..f5fc7a20b1ef 100644 --- a/sys/compat/linux/linux_vdso.c +++ b/sys/compat/linux/linux_vdso.c @@ -1,184 +1,183 @@ /*- - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013-2021 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) #define __ELF_WORD_SIZE 32 #else #define __ELF_WORD_SIZE 64 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SLIST_HEAD(, linux_vdso_sym) __elfN(linux_vdso_syms) = SLIST_HEAD_INITIALIZER(__elfN(linux_vdso_syms)); void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *s) { SLIST_INSERT_HEAD(&__elfN(linux_vdso_syms), s, sym); } vm_object_t __elfN(linux_shared_page_init)(char **mapping, vm_size_t size) { vm_page_t m; vm_object_t obj; vm_offset_t addr; size_t n, pages; pages = size / PAGE_SIZE; addr = kva_alloc(size); obj = vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(obj); for (n = 0; n < pages; n++) { m = vm_page_grab(obj, n, VM_ALLOC_ZERO); vm_page_valid(m); vm_page_xunbusy(m); pmap_qenter(addr + n * PAGE_SIZE, &m, 1); } VM_OBJECT_WUNLOCK(obj); *mapping = (char *)addr; return (obj); } void __elfN(linux_shared_page_fini)(vm_object_t obj, void *mapping, vm_size_t size) { vm_offset_t va; va = (vm_offset_t)mapping; pmap_qremove(va, size / PAGE_SIZE); kva_free(va, size); vm_object_deallocate(obj); } void __elfN(linux_vdso_fixup)(char *base, vm_offset_t offset) { struct linux_vdso_sym *lsym; const Elf_Shdr *shdr; Elf_Ehdr *ehdr; Elf_Sym *dsym, *sym; char *strtab, *symname; int i, symcnt; ehdr = (Elf_Ehdr *)base; MPASS(IS_ELF(*ehdr)); MPASS(ehdr->e_ident[EI_CLASS] == ELF_TARG_CLASS); MPASS(ehdr->e_ident[EI_DATA] == ELF_TARG_DATA); MPASS(ehdr->e_ident[EI_VERSION] == EV_CURRENT); MPASS(ehdr->e_shentsize == sizeof(Elf_Shdr)); MPASS(ehdr->e_shoff != 0); MPASS(ehdr->e_type == ET_DYN); shdr = (const Elf_Shdr *)(base + ehdr->e_shoff); dsym = NULL; for (i = 0; i < ehdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; if (shdr[i].sh_type == SHT_DYNSYM) { dsym = (Elf_Sym *)(base + shdr[i].sh_offset); strtab = base + shdr[shdr[i].sh_link].sh_offset; symcnt = shdr[i].sh_size / sizeof(*dsym); break; } } MPASS(dsym != NULL); ehdr->e_ident[EI_OSABI] = ELFOSABI_LINUX; /* * VDSO is readonly mapped to the process VA and * can't be relocated by rtld. */ SLIST_FOREACH(lsym, &__elfN(linux_vdso_syms), sym) { for (i = 0, sym = dsym; i < symcnt; i++, sym++) { symname = strtab + sym->st_name; if (strncmp(lsym->symname, symname, lsym->size) == 0) { sym->st_value += offset; *lsym->ptr = sym->st_value; break; } } } } int linux_map_vdso(struct proc *p, vm_object_t obj, vm_offset_t base, vm_offset_t size, struct image_params *imgp) { struct vmspace *vmspace; vm_map_t map; int error; MPASS((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX); MPASS(obj != NULL); vmspace = p->p_vmspace; map = &vmspace->vm_map; vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, base, size, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } return (0); } diff --git a/sys/compat/linux/linux_vdso.h b/sys/compat/linux/linux_vdso.h index 870d0dd97fa2..6708b94f2694 100644 --- a/sys/compat/linux/linux_vdso.h +++ b/sys/compat/linux/linux_vdso.h @@ -1,67 +1,66 @@ /*- - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013-2021 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_VDSO_H_ #define _LINUX_VDSO_H_ #include struct linux_vdso_sym { SLIST_ENTRY(linux_vdso_sym) sym; uint32_t size; uintptr_t * ptr; char symname[]; }; vm_object_t __elfN(linux_shared_page_init)(char **, vm_size_t); void __elfN(linux_shared_page_fini)(vm_object_t, void *, vm_size_t); void __elfN(linux_vdso_fixup)(char *, vm_offset_t); void __elfN(linux_vdso_sym_init)(struct linux_vdso_sym *); int linux_map_vdso(struct proc *, vm_object_t, vm_offset_t, vm_offset_t, struct image_params *); #define LINUX_VDSO_SYM_INTPTR(name) \ uintptr_t name; \ LINUX_VDSO_SYM_DEFINE(name) #define LINUX_VDSO_SYM_CHAR(name) \ const char * name; \ LINUX_VDSO_SYM_DEFINE(name) #define LINUX_VDSO_SYM_DEFINE(name) \ static struct linux_vdso_sym name ## sym = { \ .symname = #name, \ .size = sizeof(#name), \ .ptr = (uintptr_t *)&name \ }; \ SYSINIT(__elfN(name ## _sym_init), SI_SUB_EXEC, \ SI_ORDER_FIRST, __elfN(linux_vdso_sym_init), &name ## sym); \ struct __hack #endif /* _LINUX_VDSO_H_ */ diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index 8a884e4c9a55..91c045e85faf 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -1,347 +1,347 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2014 Dmitry Chagin + * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC"); _Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK"); MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures"); static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; static struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); static struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; static struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; int eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval, int flags) { struct eventfd *efd; int fflags; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_VALUE(initval); efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & EFD_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops); return (0); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); mtx_destroy(&efd->efd_lock); free(efd, M_EVENTFD); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; efd = fp->f_data; mtx_lock(&efd->efd_lock); while (error == 0 && efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdrd", 0); } if (error == 0) { MPASS(efd->efd_count > 0); if ((efd->efd_flags & EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); efd = fp->f_data; mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdwr", 0); if (error == 0) goto retry; } if (error == 0) { MPASS(UINT64_MAX - efd->efd_count > count); efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents; efd = fp->f_data; revents = 0; mtx_lock(&efd->efd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0) revents |= events & (POLLIN | POLLRDNORM); if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT | POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd = fp->f_data; mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)efd->efd_count; ret = efd->efd_count > 0; return (ret); } static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count); ret = UINT64_MAX - 1 > efd->efd_count; return (ret); } static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); st->st_mode = S_IFIFO; return (0); } static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct eventfd *efd = fp->f_data; kif->kf_type = KF_TYPE_EVENTFD; mtx_lock(&efd->efd_lock); kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count; kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags; mtx_unlock(&efd->efd_lock); return (0); } diff --git a/sys/x86/linux/linux_dummy_x86.c b/sys/x86/linux/linux_dummy_x86.c index d6a1ce38da48..c48443096180 100644 --- a/sys/x86/linux/linux_dummy_x86.c +++ b/sys/x86/linux/linux_dummy_x86.c @@ -1,59 +1,58 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * - * Copyright (c) 2013 Dmitry Chagin - * All rights reserved. + * Copyright (c) 2013 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); DUMMY(sysfs); DUMMY(quotactl); /* Linux 2.6.13: */ DUMMY(inotify_init); /* Linux 2.6.22: */ DUMMY(signalfd);