diff --git a/share/man/man4/linux.4 b/share/man/man4/linux.4 index ee2629092100..ecf633f39648 100644 --- a/share/man/man4/linux.4 +++ b/share/man/man4/linux.4 @@ -1,188 +1,194 @@ .\" Copyright (c) 2000 Sheldon Hearn .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd January 9, 2022 .Dt LINUX 4 .Os .Sh NAME .Nm linux .Nd Linux ABI support .Sh SYNOPSIS To enable the Linux ABI at boot time, place the following line in .Xr rc.conf 5 : .Bd -literal -offset indent linux_enable="YES" .Ed .Sh DESCRIPTION The .Nm kernel module provides limited Linux ABI (application binary interface) compatibility, making it possible to run many unmodified Linux applications without the need for virtualization or emulation. Some of the facilities provided are: .Bl -bullet .It Linux to native system call translation .It Linux-specific system calls .It Special signal handling for Linux processes .It Path translation mechanism .It Linux-specific virtual file systems .El .Pp The path translation mechanism makes Linux processes look up file paths under .Va emul_path (defaulting to .Pa /compat/linux ) before .Pa / . For example, when Linux process attempts to open .Pa /etc/passwd , it will really access .Pa /compat/linux/etc/passwd , unless the latter does not exist. This is used to make sure Linux processes load Linux shared libraries instead of their similarly-named FreeBSD counterparts, and also to provide alternative versions of certain other files and virtual file systems. .Pp To install Linux shared libraries and system files into .Pa /compat/linux , either use the .Pa emulators/linux_base-c7 port or package, or .Xr debootstrap 8 installed from .Pa sysutils/debootstrap . .Pp To avoid mounting Linux-specific filesystems at startup, add the following line to the .Xr rc.conf 5 file: .Pp .Dl linux_mounts_enable="NO" .Sh SYSCTL VARIABLES The following variables are available as both .Xr sysctl 8 variables and .Xr loader 8 tunables: .Bl -tag -width indent .It Va compat.linux.debug Enable debugging messages. Set to 0 to silence them. Defaults to 3. A setting of 1 prints debug messages, tells about unimplemented stuff (only once). Set to 2 is like 1, but also prints messages about implemented but not tested stuff (only once). Setting it to 3 or higher is like 2, but no rate limiting of messages. .It Va compat.linux.default_openfiles Default soft openfiles resource limit for Linux applications. Set to -1 to disable the limit. Defaults to 1024. .It Va compat.linux.emul_path Path to the Linux run-time environment. Defaults to .Pa /compat/linux . .It Va compat.linux.osname Linux kernel operating system name. Defaults to "Linux". .It Va compat.linux.osrelease Linux kernel operating system release. Changing this to something else is discouraged on non-development systems, because it may change the way Linux programs work. Some versions of GNU libc are known to use different syscalls depending on the value of this sysctl. .It Va compat.linux.oss_version Linux Open Sound System version. Defaults to 198144. .It Va compat.linux.preserve_vstatus When set to 1, it prevents Linux applications from resetting the .Xr termios 4 VSTATUS setting. From a user perspective, this makes .Va SIGINFO work for Linux executables. Defaults to 1. .It Va compat.linux.setid_allowed Enable handling of set-user-ID and set-group-ID mode bits for the new process image file when image is to be executed under Linux ABI. When set to 0, new Linux images always use credentials of the program that issued the .Xr execve 2 call, regardless of the image file mode. This might be reasonable or even required, because .Fx does not emulate the Linux environment completely, and missed features may result in security vulnerabilities. Defaults to 1. +.It Va compat.linux32.emulate_i386 +In the x86_64 (amd64) world enable the real i386 Linuxulator behavior. +For example, when set to 0, Linux uname -m will return "x86_64" even if +uname itself is a i386 Linux executable. When set to 1, Linux i386 +uname -m will return "i686". +Defaults to 0. .El .Sh FILES .Bl -tag -width /compat/linux/dev/shm -compact .It Pa /compat/linux Linux run-time environment .It Pa /compat/linux/dev device file system, see .Xr devfs 5 .It Pa /compat/linux/dev/fd file descriptor file system mounted with the .Cm linrdlnk option, see .Xr fdescfs 5 .It Pa /compat/linux/dev/shm in-memory file system, see .Xr tmpfs 5 .It Pa /compat/linux/proc Linux process file system, see .Xr linprocfs 5 .It Pa /compat/linux/sys Linux kernel objects file system, see .Xr linsysfs 5 .El .Sh SEE ALSO .Xr brandelf 1 , .Xr pty 4 , .Xr elf 5 , .Xr fdescfs 5 , .Xr linprocfs 5 , .Xr linsysfs 5 , .Xr tmpfs 5 .Sh HISTORY Linux ABI support first appeared for i386 in .Fx 2.1 . Support for amd64 binaries first appeared in .Fx 10.3 . Support for arm64 binaries first appeared in .Fx 12.0 . .Sh BUGS Support for some of the Linux-specific system calls and system call arguments is missing. diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h index f1f877181e47..1273ff485dd9 100644 --- a/sys/amd64/linux32/linux.h +++ b/sys/amd64/linux32/linux.h @@ -1,640 +1,642 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2001 Doug Rabson * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _AMD64_LINUX_H_ #define _AMD64_LINUX_H_ #include #include #include #define LINUX_LEGACY_SYSCALLS #define LINUX_DTRACE linuxulator32 #define LINUX32_MAXDSIZ (512 * 1024 * 1024) /* 512MB */ #define LINUX32_MAXSSIZ (64 * 1024 * 1024) /* 64MB */ #define LINUX32_MAXVMEM 0 /* Unlimited */ /* * Provide a separate set of types for the Linux types. */ typedef int l_int; typedef int32_t l_long; typedef int64_t l_longlong; typedef short l_short; typedef unsigned int l_uint; typedef uint32_t l_ulong; typedef uint64_t l_ulonglong; typedef unsigned short l_ushort; typedef l_ulong l_uintptr_t; typedef l_long l_clock_t; typedef l_int l_daddr_t; typedef l_ushort l_dev_t; typedef l_uint l_gid_t; typedef l_ushort l_gid16_t; typedef l_ulong l_ino_t; typedef l_int l_key_t; typedef l_longlong l_loff_t; typedef l_ushort l_mode_t; typedef l_long l_off_t; typedef l_int l_pid_t; typedef l_uint l_size_t; typedef l_long l_suseconds_t; typedef l_long l_time_t; typedef l_longlong l_time64_t; typedef l_uint l_uid_t; typedef l_ushort l_uid16_t; typedef l_int l_timer_t; typedef l_int l_mqd_t; typedef l_ulong l_fd_mask; typedef struct { l_int val[2]; } l_fsid_t; typedef struct { l_time_t tv_sec; l_suseconds_t tv_usec; } l_timeval; #define l_fd_set fd_set /* * Miscellaneous */ #define LINUX_AT_COUNT 21 /* Count of used aux entry types. * Keep this synchronized with * linux_fixup_elf() code. */ struct l___sysctl_args { l_uintptr_t name; l_int nlen; l_uintptr_t oldval; l_uintptr_t oldlenp; l_uintptr_t newval; l_size_t newlen; l_ulong __spare[4]; }; /* Resource limits */ #define LINUX_RLIMIT_CPU 0 #define LINUX_RLIMIT_FSIZE 1 #define LINUX_RLIMIT_DATA 2 #define LINUX_RLIMIT_STACK 3 #define LINUX_RLIMIT_CORE 4 #define LINUX_RLIMIT_RSS 5 #define LINUX_RLIMIT_NPROC 6 #define LINUX_RLIMIT_NOFILE 7 #define LINUX_RLIMIT_MEMLOCK 8 #define LINUX_RLIMIT_AS 9 /* Address space limit */ #define LINUX_RLIM_NLIMITS 10 struct l_rlimit { l_ulong rlim_cur; l_ulong rlim_max; }; struct l_rusage { l_timeval ru_utime; l_timeval ru_stime; l_long ru_maxrss; l_long ru_ixrss; l_long ru_idrss; l_long ru_isrss; l_long ru_minflt; l_long ru_majflt; l_long ru_nswap; l_long ru_inblock; l_long ru_oublock; l_long ru_msgsnd; l_long ru_msgrcv; l_long ru_nsignals; l_long ru_nvcsw; l_long ru_nivcsw; }; struct l_mmap_argv { l_uintptr_t addr; l_size_t len; l_int prot; l_int flags; l_int fd; l_ulong pgoff; }; /* * stat family of syscalls */ struct l_timespec { l_time_t tv_sec; l_long tv_nsec; }; /* __kernel_timespec */ struct l_timespec64 { l_time64_t tv_sec; l_longlong tv_nsec; }; struct l_newstat { l_ushort st_dev; l_ushort __pad1; l_ulong st_ino; l_ushort st_mode; l_ushort st_nlink; l_ushort st_uid; l_ushort st_gid; l_ushort st_rdev; l_ushort __pad2; l_ulong st_size; l_ulong st_blksize; l_ulong st_blocks; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_ulong __unused4; l_ulong __unused5; }; struct l_stat { l_ushort st_dev; l_ulong st_ino; l_ushort st_mode; l_ushort st_nlink; l_ushort st_uid; l_ushort st_gid; l_ushort st_rdev; l_long st_size; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_long st_blksize; l_long st_blocks; l_ulong st_flags; l_ulong st_gen; }; struct l_stat64 { l_ushort st_dev; u_char __pad0[10]; l_ulong __st_ino; l_uint st_mode; l_uint st_nlink; l_ulong st_uid; l_ulong st_gid; l_ushort st_rdev; u_char __pad3[10]; l_longlong st_size; l_ulong st_blksize; l_ulong st_blocks; l_ulong __pad4; struct l_timespec st_atim; struct l_timespec st_mtim; struct l_timespec st_ctim; l_ulonglong st_ino; } __packed; struct l_statfs64 { l_int f_type; l_int f_bsize; uint64_t f_blocks; uint64_t f_bfree; uint64_t f_bavail; uint64_t f_files; uint64_t f_ffree; l_fsid_t f_fsid; l_int f_namelen; l_int f_frsize; l_int f_flags; l_int f_spare[4]; } __packed; /* sigaction flags */ #define LINUX_SA_NOCLDSTOP 0x00000001 #define LINUX_SA_NOCLDWAIT 0x00000002 #define LINUX_SA_SIGINFO 0x00000004 #define LINUX_SA_RESTORER 0x04000000 #define LINUX_SA_ONSTACK 0x08000000 #define LINUX_SA_RESTART 0x10000000 #define LINUX_SA_INTERRUPT 0x20000000 #define LINUX_SA_NOMASK 0x40000000 #define LINUX_SA_ONESHOT 0x80000000 /* sigprocmask actions */ #define LINUX_SIG_BLOCK 0 #define LINUX_SIG_UNBLOCK 1 #define LINUX_SIG_SETMASK 2 /* sigaltstack */ #define LINUX_MINSIGSTKSZ 2048 typedef l_uintptr_t l_handler_t; typedef l_ulong l_osigset_t; typedef struct { l_handler_t lsa_handler; l_osigset_t lsa_mask; l_ulong lsa_flags; l_uintptr_t lsa_restorer; } l_osigaction_t; typedef struct { l_handler_t lsa_handler; l_ulong lsa_flags; l_uintptr_t lsa_restorer; l_sigset_t lsa_mask; } __packed l_sigaction_t; typedef struct { l_uintptr_t ss_sp; l_int ss_flags; l_size_t ss_size; } l_stack_t; /* The Linux sigcontext, pretty much a standard 386 trapframe. */ struct l_sigcontext { l_uint sc_gs; l_uint sc_fs; l_uint sc_es; l_uint sc_ds; l_uint sc_edi; l_uint sc_esi; l_uint sc_ebp; l_uint sc_esp; l_uint sc_ebx; l_uint sc_edx; l_uint sc_ecx; l_uint sc_eax; l_uint sc_trapno; l_uint sc_err; l_uint sc_eip; l_uint sc_cs; l_uint sc_eflags; l_uint sc_esp_at_signal; l_uint sc_ss; l_uint sc_387; l_uint sc_mask; l_uint sc_cr2; }; struct l_ucontext { l_ulong uc_flags; l_uintptr_t uc_link; l_stack_t uc_stack; struct l_sigcontext uc_mcontext; l_sigset_t uc_sigmask; } __packed; #define LINUX_SI_MAX_SIZE 128 #define LINUX_SI_PAD_SIZE ((LINUX_SI_MAX_SIZE/sizeof(l_int)) - 3) typedef union l_sigval { l_int sival_int; l_uintptr_t sival_ptr; } l_sigval_t; typedef struct l_siginfo { l_int lsi_signo; l_int lsi_errno; l_int lsi_code; union { l_int _pad[LINUX_SI_PAD_SIZE]; struct { l_pid_t _pid; l_uid_t _uid; } _kill; struct { l_timer_t _tid; l_int _overrun; char _pad[sizeof(l_uid_t) - sizeof(l_int)]; l_sigval_t _sigval; l_int _sys_private; } _timer; struct { l_pid_t _pid; /* sender's pid */ l_uid_t _uid; /* sender's uid */ l_sigval_t _sigval; } _rt; struct { l_pid_t _pid; /* which child */ l_uid_t _uid; /* sender's uid */ l_int _status; /* exit code */ l_clock_t _utime; l_clock_t _stime; } _sigchld; struct { l_uintptr_t _addr; /* Faulting insn/memory ref. */ } _sigfault; struct { l_long _band; /* POLL_IN,POLL_OUT,POLL_MSG */ l_int _fd; } _sigpoll; } _sifields; } l_siginfo_t; #define lsi_pid _sifields._kill._pid #define lsi_uid _sifields._kill._uid #define lsi_tid _sifields._timer._tid #define lsi_overrun _sifields._timer._overrun #define lsi_sys_private _sifields._timer._sys_private #define lsi_status _sifields._sigchld._status #define lsi_utime _sifields._sigchld._utime #define lsi_stime _sifields._sigchld._stime #define lsi_value _sifields._rt._sigval #define lsi_int _sifields._rt._sigval.sival_int #define lsi_ptr _sifields._rt._sigval.sival_ptr #define lsi_addr _sifields._sigfault._addr #define lsi_band _sifields._sigpoll._band #define lsi_fd _sifields._sigpoll._fd struct l_fpreg { u_int16_t significand[4]; u_int16_t exponent; }; struct l_fpxreg { u_int16_t significand[4]; u_int16_t exponent; u_int16_t padding[3]; }; struct l_xmmreg { u_int32_t element[4]; }; struct l_fpstate { /* Regular FPU environment */ u_int32_t cw; u_int32_t sw; u_int32_t tag; u_int32_t ipoff; u_int32_t cssel; u_int32_t dataoff; u_int32_t datasel; struct l_fpreg _st[8]; u_int16_t status; u_int16_t magic; /* 0xffff = regular FPU data */ /* FXSR FPU environment */ u_int32_t _fxsr_env[6]; /* env is ignored. */ u_int32_t mxcsr; u_int32_t reserved; struct l_fpxreg _fxsr_st[8]; /* reg data is ignored. */ struct l_xmmreg _xmm[8]; u_int32_t padding[56]; }; /* * We make the stack look like Linux expects it when calling a signal * handler, but use the BSD way of calling the handler and sigreturn(). * This means that we need to pass the pointer to the handler too. * It is appended to the frame to not interfere with the rest of it. */ struct l_sigframe { l_int sf_sig; struct l_sigcontext sf_sc; struct l_fpstate sf_fpstate; l_uint sf_extramask[1]; l_handler_t sf_handler; }; struct l_rt_sigframe { l_int sf_sig; l_uintptr_t sf_siginfo; l_uintptr_t sf_ucontext; l_siginfo_t sf_si; struct l_ucontext sf_sc; l_handler_t sf_handler; } __packed; /* * arch specific open/fcntl flags */ #define LINUX_F_GETLK64 12 #define LINUX_F_SETLK64 13 #define LINUX_F_SETLKW64 14 union l_semun { l_int val; l_uintptr_t buf; l_uintptr_t array; l_uintptr_t __buf; l_uintptr_t __pad; }; struct l_ifmap { l_ulong mem_start; l_ulong mem_end; l_ushort base_addr; u_char irq; u_char dma; u_char port; /* 3 bytes spare */ }; struct l_ifreq { union { char ifrn_name[LINUX_IFNAMSIZ]; } ifr_ifrn; union { struct l_sockaddr ifru_addr; struct l_sockaddr ifru_dstaddr; struct l_sockaddr ifru_broadaddr; struct l_sockaddr ifru_netmask; struct l_sockaddr ifru_hwaddr; l_short ifru_flags[1]; l_int ifru_ivalue; l_int ifru_mtu; struct l_ifmap ifru_map; char ifru_slave[LINUX_IFNAMSIZ]; l_uintptr_t ifru_data; } ifr_ifru; }; #define ifr_name ifr_ifrn.ifrn_name /* Interface name */ #define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ #define ifr_ifindex ifr_ifru.ifru_ivalue /* Interface index */ struct l_ifconf { int ifc_len; union { l_uintptr_t ifcu_buf; l_uintptr_t ifcu_req; } ifc_ifcu; }; #define ifc_buf ifc_ifcu.ifcu_buf #define ifc_req ifc_ifcu.ifcu_req struct l_user_desc { l_uint entry_number; l_uint base_addr; l_uint limit; l_uint seg_32bit:1; l_uint contents:2; l_uint read_exec_only:1; l_uint limit_in_pages:1; l_uint seg_not_present:1; l_uint useable:1; }; #define LINUX_LOWERWORD 0x0000ffff /* * Macros which does the same thing as those in Linux include/asm-um/ldt-i386.h. * These convert Linux user space descriptor to machine one. */ #define LINUX_LDT_entry_a(info) \ ((((info)->base_addr & LINUX_LOWERWORD) << 16) | \ ((info)->limit & LINUX_LOWERWORD)) #define LINUX_ENTRY_B_READ_EXEC_ONLY 9 #define LINUX_ENTRY_B_CONTENTS 10 #define LINUX_ENTRY_B_SEG_NOT_PRESENT 15 #define LINUX_ENTRY_B_BASE_ADDR 16 #define LINUX_ENTRY_B_USEABLE 20 #define LINUX_ENTRY_B_SEG32BIT 22 #define LINUX_ENTRY_B_LIMIT 23 #define LINUX_LDT_entry_b(info) \ (((info)->base_addr & 0xff000000) | \ ((info)->limit & 0xf0000) | \ ((info)->contents << LINUX_ENTRY_B_CONTENTS) | \ (((info)->seg_not_present == 0) << LINUX_ENTRY_B_SEG_NOT_PRESENT) | \ (((info)->base_addr & 0x00ff0000) >> LINUX_ENTRY_B_BASE_ADDR) | \ (((info)->read_exec_only == 0) << LINUX_ENTRY_B_READ_EXEC_ONLY) | \ ((info)->seg_32bit << LINUX_ENTRY_B_SEG32BIT) | \ ((info)->useable << LINUX_ENTRY_B_USEABLE) | \ ((info)->limit_in_pages << LINUX_ENTRY_B_LIMIT) | 0x7000) #define LINUX_LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ (info)->seg_not_present == 1 && \ (info)->read_exec_only == 1 && \ (info)->seg_32bit == 0 && \ (info)->limit_in_pages == 0 && \ (info)->useable == 0) /* * Macros for converting segments. * They do the same as those in arch/i386/kernel/process.c in Linux. */ #define LINUX_GET_BASE(desc) \ ((((desc)->a >> 16) & LINUX_LOWERWORD) | \ (((desc)->b << 16) & 0x00ff0000) | \ ((desc)->b & 0xff000000)) #define LINUX_GET_LIMIT(desc) \ (((desc)->a & LINUX_LOWERWORD) | \ ((desc)->b & 0xf0000)) #define LINUX_GET_32BIT(desc) \ (((desc)->b >> LINUX_ENTRY_B_SEG32BIT) & 1) #define LINUX_GET_CONTENTS(desc) \ (((desc)->b >> LINUX_ENTRY_B_CONTENTS) & 3) #define LINUX_GET_WRITABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_READ_EXEC_ONLY) & 1) #define LINUX_GET_LIMIT_PAGES(desc) \ (((desc)->b >> LINUX_ENTRY_B_LIMIT) & 1) #define LINUX_GET_PRESENT(desc) \ (((desc)->b >> LINUX_ENTRY_B_SEG_NOT_PRESENT) & 1) #define LINUX_GET_USEABLE(desc) \ (((desc)->b >> LINUX_ENTRY_B_USEABLE) & 1) struct iovec; struct uio; struct l_iovec32 { uint32_t iov_base; l_size_t iov_len; }; int linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp, int error); int linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop); int linux_copyout_rusage(struct rusage *ru, void *uaddr); /* robust futexes */ struct linux_robust_list { l_uintptr_t next; }; struct linux_robust_list_head { struct linux_robust_list list; l_long futex_offset; l_uintptr_t pending_list; }; /* This corresponds to 'struct user_regs_struct32' in Linux. */ struct linux_pt_regset32 { l_uint ebx; l_uint ecx; l_uint edx; l_uint esi; l_uint edi; l_uint ebp; l_uint eax; l_uint ds; l_uint es; l_uint fs; l_uint gs; l_uint orig_eax; l_uint eip; l_uint cs; l_uint eflags; l_uint esp; l_uint ss; }; struct reg32; void bsd_to_linux_regset32(const struct reg32 *b_reg, struct linux_pt_regset32 *l_regset); +extern bool linux32_emulate_i386; + #endif /* !_AMD64_LINUX_H_ */ diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index e84b61580b42..227cefa92847 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -1,1227 +1,1230 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE) #define LINUX32_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX32_VDSOPAGE (LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE) #define LINUX32_SHAREDPAGE (LINUX32_VDSOPAGE - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX32_USRSTACK LINUX32_SHAREDPAGE static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux32_vdso_so_o_start; extern char _binary_linux32_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static void linux32_fixlimit(struct rlimit *rl, int which); static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); static void linux32_set_fork_retval(struct thread *td); static void linux32_set_syscall_retval(struct thread *td, int error); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary)); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp)); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf32_Addr *base; base = (Elf32_Addr *)*stack_base; base--; if (suword32(base, (uint32_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = __kernel_rt_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = __kernel_sigreturn; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ eflags = frame.sf_sc.sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; lmask.__mask = frame.sf_extramask[0]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ eflags = context->sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void linux32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_rax = bsd_to_linux_errno(error); } } static void linux32_set_fork_retval(struct thread *td) { struct trapframe *frame = td->td_frame; frame->tf_rax = 0; } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = (register_t)imgp->ps_strings; x86_clear_dbregs(pcb); fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); } /* * XXX copied from ia32_sysvec.c. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; u_int32_t *vectp; char *stringp; uintptr_t destp, ustringp; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; arginfo = (struct linux32_ps_strings *)PROC_PS_STRINGS(imgp->proc); destp = (uintptr_t)arginfo; if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(uint32_t)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(uint32_t)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(uint32_t)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(uint32_t)); } vectp = (uint32_t *)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword32(vectp++, 0) != 0) return (EFAULT); if (suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword32(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); +bool linux32_emulate_i386 = false; +SYSCTL_BOOL(_compat_linux32, OID_AUTO, emulate_i386, CTLFLAG_RWTUN, + &linux32_emulate_i386, 0, "Emulate the real i386"); static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_psstringssz = sizeof(struct linux32_ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux32_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, .sv_set_fork_retval = linux32_set_fork_retval, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error; error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX32_VDSOPAGE_SIZE, imgp); if (error == 0) linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). */ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_timekeep_base; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux32_vdso_so_o_start; char *vdso_end = &_binary_linux32_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE); linux_vdso_base = LINUX32_VDSOPAGE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Shdr *shdr; const Elf_Rel *rel; const Elf_Ehdr *ehdr; Elf32_Addr *where; Elf_Size rtype, symidx; Elf32_Addr addr, addend; int i, relcnt; MPASS(offset != 0); relcnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); relcnt = shdr[i].sh_size / sizeof(*rel); break; case SHT_RELA: printf("Linux i386 vDSO: unexpected Rela section\n"); break; } } for (i = 0; i < relcnt; i++, rel++) { where = (Elf32_Addr *)(mapping + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); switch (rtype) { case R_386_NONE: /* none */ break; case R_386_RELATIVE: /* B + A */ addr = (Elf32_Addr)PTROUT(offset + addend); if (*where != addr) *where = addr; break; case R_386_IRELATIVE: printf("Linux i386 vDSO: unexpected ifunc relocation, " "symbol index %ld\n", (intmax_t)symidx); break; default: printf("Linux i386 vDSO: unexpected relocation type %ld, " "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx); } } } static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE | LINUX_BI_FUTEX_REQUEUE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux i386 ELF exec handler installed\n"); } else printf("cannot insert Linux i386 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux i386 ELF exec handler removed\n"); } else printf("Could not deinstall Linux i386 ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c index fa7d364681dc..b1f465ef5a9e 100644 --- a/sys/compat/linux/linux_misc.c +++ b/sys/compat/linux/linux_misc.c @@ -1,2915 +1,2920 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include #include int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalhigh; l_ulong freehigh; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; struct l_pselect6arg { l_uintptr_t ss; l_size_t ss_len; }; static int linux_utimensat_lts_to_ts(struct l_timespec *, struct timespec *); #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int linux_utimensat_lts64_to_ts(struct l_timespec64 *, struct timespec *); #endif static int linux_common_utimensat(struct thread *, int, const char *, struct timespec *, int); static int linux_common_pselect6(struct thread *, l_int, l_fd_set *, l_fd_set *, l_fd_set *, struct timespec *, l_uintptr_t *); static int linux_common_ppoll(struct thread *, struct pollfd *, uint32_t, struct timespec *, l_sigset_t *, l_size_t); static int linux_pollin(struct thread *, struct pollfd *, struct pollfd *, u_int); static int linux_pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; int i, j; struct timespec ts; bzero(&sysinfo, sizeof(sysinfo)); getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE; /* * sharedram counts pages allocated to named, swap-backed objects such * as shared memory segments and tmpfs files. There is no cheap way to * compute this, so just leave the field unpopulated. Linux itself only * started setting this field in the 3.x timeframe. */ sysinfo.sharedram = 0; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* * Platforms supported by the emulation layer do not have a notion of * high memory. */ sysinfo.totalhigh = 0; sysinfo.freehigh = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } #ifdef LINUX_LEGACY_SYSCALLS int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error __diagused; secs = args->secs; /* * Linux alarm() is always successful. Limit secs to INT32_MAX / 2 * to match kern_setitimer()'s limit to avoid error from it. * * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit * platforms. */ if (secs > INT32_MAX / 2) secs = INT32_MAX / 2; it.it_value.tv_sec = secs; it.it_value.tv_usec = 0; timevalclear(&it.it_interval); error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); KASSERT(error == 0, ("kern_setitimer returns %d", error)); if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) || old_it.it_value.tv_usec >= 500000) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; return (0); } #endif int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; uintptr_t new, old; old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (uintptr_t)args->dsend; if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new)) td->td_retval[0] = (register_t)new; else td->td_retval[0] = (register_t)old; return (0); } #if defined(__i386__) /* XXX: what about amd64/linux32? */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; vm_map_t map; vm_map_entry_t entry; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; char *library; ssize_t aresid; int error; bool locked, opened, textset; a_out = NULL; vp = NULL; locked = false; textset = false; opened = false; if (!LUSECONVPATH(td)) { NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, args->library, td); error = namei(&ni); } else { LCONVPATHEXIST(args->library, &library); NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); } if (error) goto cleanup; vp = ni.ni_vp; NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = true; /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; opened = true; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. */ error = VOP_SET_TEXT(vp); if (error != 0) goto cleanup; textset = true; /* * Lock no longer needed */ locked = false; VOP_UNLOCK(vp); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ map = &td->td_proc->p_vmspace->vm_map; error = vm_mmap(map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; vm_map_lock(map); if (!vm_map_lookup_entry(map, vmaddr, &entry)) { vm_map_unlock(map); error = EDOOFUS; goto cleanup; } entry->eflags |= MAP_ENTRY_VN_EXEC; vm_map_unlock(map); textset = false; } if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: if (opened) { if (locked) VOP_UNLOCK(vp); locked = false; VOP_CLOSE(vp, FREAD, td->td_ucred, td); } if (textset) { if (!locked) { locked = true; VOP_LOCK(vp, LK_SHARED | LK_RETRY); } VOP_UNSET_TEXT_CHECKED(vp); } if (locked) VOP_UNLOCK(vp); /* Release the temporary mapping. */ if (a_out) kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE); return (error); } #endif /* __i386__ */ #ifdef LINUX_LEGACY_SYSCALLS int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; /* * Store current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, LINUX_NFDBITS); if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: return (error); } #endif int linux_mremap(struct thread *td, struct linux_mremap_args *args) { uintptr_t addr; size_t len; int error = 0; if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { addr = args->addr + args->new_len; len = args->old_len - args->new_len; error = kern_munmap(td, addr, len); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { return (kern_msync(td, args->addr, args->len, args->fl & ~LINUX_MS_SYNC)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } #endif struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. */ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } #if defined(__amd64__) /* * On amd64, Linux uname(2) needs to return "x86_64" * for both 64-bit and 32-bit applications. On 32-bit, * the string returned by getauxval(AT_PLATFORM) needs * to remain "i686", though. */ +#if defined(COMPAT_LINUX32) + if (linux32_emulate_i386) + strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME); + else +#endif strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME); #elif defined(__aarch64__) strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME); #elif defined(__i386__) strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME); #endif return (copyout(&utsname, args->buf, sizeof(utsname))); } struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; #ifdef LINUX_LEGACY_SYSCALLS int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut)) != 0) return (error); tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; if (!LUSECONVPATH(td)) { error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, tvp, UIO_SYSSPACE); } else { LCONVPATHEXIST(args->fname, &fname); error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); } return (error); } #endif #ifdef LINUX_LEGACY_SYSCALLS int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv)) != 0) return (error); tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } if (!LUSECONVPATH(td)) { error = kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE, tvp, UIO_SYSSPACE); } else { LCONVPATHEXIST(args->fname, &fname); error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); } return (error); } #endif static int linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times) { if (l_times->tv_nsec != LINUX_UTIME_OMIT && l_times->tv_nsec != LINUX_UTIME_NOW && (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999)) return (EINVAL); times->tv_sec = l_times->tv_sec; switch (l_times->tv_nsec) { case LINUX_UTIME_OMIT: times->tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times->tv_nsec = UTIME_NOW; break; default: times->tv_nsec = l_times->tv_nsec; } return (0); } static int linux_common_utimensat(struct thread *td, int ldfd, const char *pathname, struct timespec *timesp, int lflags) { char *path = NULL; int error, dfd, flags = 0; dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd; if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH)) return (EINVAL); if (timesp != NULL) { /* This breaks POSIX, but is what the Linux kernel does * _on purpose_ (documented in the man page for utimensat(2)), * so we must follow that behaviour. */ if (timesp[0].tv_nsec == UTIME_OMIT && timesp[1].tv_nsec == UTIME_OMIT) return (0); } if (lflags & LINUX_AT_SYMLINK_NOFOLLOW) flags |= AT_SYMLINK_NOFOLLOW; if (lflags & LINUX_AT_EMPTY_PATH) flags |= AT_EMPTY_PATH; if (!LUSECONVPATH(td)) { if (pathname != NULL) { return (kern_utimensat(td, dfd, pathname, UIO_USERSPACE, timesp, UIO_SYSSPACE, flags)); } } if (pathname != NULL) LCONVPATHEXIST_AT(pathname, &path, dfd); else if (lflags != 0) return (EINVAL); if (path == NULL) error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE); else { error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp, UIO_SYSSPACE, flags); LFREEPATH(path); } return (error); } int linux_utimensat(struct thread *td, struct linux_utimensat_args *args) { struct l_timespec l_times[2]; struct timespec times[2], *timesp; int error; if (args->times != NULL) { error = copyin(args->times, l_times, sizeof(l_times)); if (error != 0) return (error); error = linux_utimensat_lts_to_ts(&l_times[0], ×[0]); if (error != 0) return (error); error = linux_utimensat_lts_to_ts(&l_times[1], ×[1]); if (error != 0) return (error); timesp = times; } else timesp = NULL; return (linux_common_utimensat(td, args->dfd, args->pathname, timesp, args->flags)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times) { if (l_times->tv_nsec != LINUX_UTIME_OMIT && l_times->tv_nsec != LINUX_UTIME_NOW && (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999)) return (EINVAL); times->tv_sec = l_times->tv_sec; switch (l_times->tv_nsec) { case LINUX_UTIME_OMIT: times->tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times->tv_nsec = UTIME_NOW; break; default: times->tv_nsec = l_times->tv_nsec; } return (0); } int linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args) { struct l_timespec64 l_times[2]; struct timespec times[2], *timesp; int error; if (args->times64 != NULL) { error = copyin(args->times64, l_times, sizeof(l_times)); if (error != 0) return (error); error = linux_utimensat_lts64_to_ts(&l_times[0], ×[0]); if (error != 0) return (error); error = linux_utimensat_lts64_to_ts(&l_times[1], ×[1]); if (error != 0) return (error); timesp = times; } else timesp = NULL; return (linux_common_utimensat(td, args->dfd, args->pathname, timesp, args->flags)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #ifdef LINUX_LEGACY_SYSCALLS int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv)) != 0) return (error); tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } if (!LUSECONVPATH(td)) { error = kern_utimesat(td, dfd, args->filename, UIO_USERSPACE, tvp, UIO_SYSSPACE); } else { LCONVPATHEXIST_AT(args->filename, &fname, dfd); error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); } return (error); } #endif static int linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp, int options, void *rup, l_siginfo_t *infop) { l_siginfo_t lsi; siginfo_t siginfo; struct __wrusage wru; int error, status, tmpstat, sig; error = kern_wait6(td, idtype, id, &status, options, rup != NULL ? &wru : NULL, &siginfo); if (error == 0 && statusp) { tmpstat = status & 0xffff; if (WIFSIGNALED(tmpstat)) { tmpstat = (tmpstat & 0xffffff80) | bsd_to_linux_signal(WTERMSIG(tmpstat)); } else if (WIFSTOPPED(tmpstat)) { tmpstat = (tmpstat & 0xffff00ff) | (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8); #if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32)) if (WSTOPSIG(status) == SIGTRAP) { tmpstat = linux_ptrace_status(td, siginfo.si_pid, tmpstat); } #endif } else if (WIFCONTINUED(tmpstat)) { tmpstat = 0xffff; } error = copyout(&tmpstat, statusp, sizeof(int)); } if (error == 0 && rup != NULL) error = linux_copyout_rusage(&wru.wru_self, rup); if (error == 0 && infop != NULL && td->td_retval[0] != 0) { sig = bsd_to_linux_signal(siginfo.si_signo); siginfo_to_lsiginfo(&siginfo, &lsi, sig); error = copyout(&lsi, infop, sizeof(lsi)); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { struct linux_wait4_args wait4_args; wait4_args.pid = args->pid; wait4_args.status = args->status; wait4_args.options = args->options; wait4_args.rusage = NULL; return (linux_wait4(td, &wait4_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_wait4(struct thread *td, struct linux_wait4_args *args) { struct proc *p; int options, id, idtype; if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); /* -INT_MIN is not defined. */ if (args->pid == INT_MIN) return (ESRCH); options = 0; linux_to_bsd_waitopts(args->options, &options); /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; /* * As FreeBSD does not have __WALL option bit analogue explicitly set all * possible option bits to emulate Linux __WALL wait option bit. The same * for waitid system call. */ if ((args->options & __WALL) != 0) options |= WUNTRACED | WCONTINUED | WLINUXCLONE; if (args->pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (args->pid < 0) { idtype = P_PGID; id = (id_t)-args->pid; } else if (args->pid == 0) { idtype = P_PGID; p = td->td_proc; PROC_LOCK(p); id = p->p_pgid; PROC_UNLOCK(p); } else { idtype = P_PID; id = (id_t)args->pid; } return (linux_common_wait(td, idtype, id, args->status, options, args->rusage, NULL)); } int linux_waitid(struct thread *td, struct linux_waitid_args *args) { idtype_t idtype; int error, options; struct proc *p; pid_t id; if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED | LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); options = 0; linux_to_bsd_waitopts(args->options, &options); if ((args->options & __WALL) != 0) options |= WEXITED | WTRAPPED | WUNTRACED | WCONTINUED | WLINUXCLONE; id = args->id; switch (args->idtype) { case LINUX_P_ALL: idtype = P_ALL; break; case LINUX_P_PID: if (args->id <= 0) return (EINVAL); idtype = P_PID; break; case LINUX_P_PGID: if (linux_use54(td) && args->id == 0) { p = td->td_proc; PROC_LOCK(p); id = p->p_pgid; PROC_UNLOCK(p); } else if (args->id <= 0) return (EINVAL); idtype = P_PGID; break; case LINUX_P_PIDFD: LINUX_RATELIMIT_MSG("unsupported waitid P_PIDFD idtype"); return (ENOSYS); default: return (EINVAL); } error = linux_common_wait(td, idtype, id, NULL, options, args->rusage, args->info); td->td_retval[0] = 0; return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; enum uio_seg seg; bool convpath; convpath = LUSECONVPATH(td); if (!convpath) { path = args->path; seg = UIO_USERSPACE; } else { LCONVPATHCREAT(args->path, &path); seg = UIO_SYSSPACE; } switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, AT_FDCWD, path, seg, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, AT_FDCWD, path, seg, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, AT_FDCWD, path, seg, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } if (convpath) LFREEPATH(path); return (error); } #endif int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { char *path; int error, dfd; enum uio_seg seg; bool convpath; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; convpath = LUSECONVPATH(td); if (!convpath) { path = __DECONST(char *, args->filename); seg = UIO_USERSPACE; } else { LCONVPATHCREAT_AT(args->filename, &path, dfd); seg = UIO_SYSSPACE; } switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, dfd, path, seg, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, dfd, path, seg, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, dfd, path, seg, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } if (convpath) LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! */ int linux_personality(struct thread *td, struct linux_personality_args *args) { struct linux_pemuldata *pem; struct proc *p = td->td_proc; uint32_t old; PROC_LOCK(p); pem = pem_find(p); old = pem->persona; if (args->per != 0xffffffff) pem->persona = args->per; PROC_UNLOCK(p); td->td_retval[0] = old; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_nice(struct thread *td, struct linux_nice_args *args) { return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); crextend(newcred, ngrp + 1); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; crcopy(newcred, oldcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_LINUX); if (error) return (error); td->td_retval[0] = ngrp; return (0); } static bool linux_get_dummy_limit(l_uint resource, struct rlimit *rlim) { if (linux_dummy_rlimits == 0) return (false); switch (resource) { case LINUX_RLIMIT_LOCKS: case LINUX_RLIMIT_SIGPENDING: case LINUX_RLIMIT_MSGQUEUE: case LINUX_RLIMIT_RTTIME: rlim->rlim_cur = LINUX_RLIM_INFINITY; rlim->rlim_max = LINUX_RLIM_INFINITY; return (true); case LINUX_RLIMIT_NICE: case LINUX_RLIMIT_RTPRIO: rlim->rlim_cur = 0; rlim->rlim_max = 0; return (true); default: return (false); } } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; if (linux_get_dummy_limit(args->resource, &bsd_rlim)) { rlim.rlim_cur = bsd_rlim.rlim_cur; rlim.rlim_max = bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; if (linux_get_dummy_limit(args->resource, &bsd_rlim)) { rlim.rlim_cur = bsd_rlim.rlim_cur; rlim.rlim_max = bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_param sched_param; struct thread *tdt; int error, policy; switch (args->policy) { case LINUX_SCHED_OTHER: policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: policy = SCHED_FIFO; break; case LINUX_SCHED_RR: policy = SCHED_RR; break; default: return (EINVAL); } error = copyin(args->param, &sched_param, sizeof(sched_param)); if (error) return (error); if (linux_map_sched_prio) { switch (policy) { case SCHED_OTHER: if (sched_param.sched_priority != 0) return (EINVAL); sched_param.sched_priority = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE; break; case SCHED_FIFO: case SCHED_RR: if (sched_param.sched_priority < 1 || sched_param.sched_priority >= LINUX_MAX_RT_PRIO) return (EINVAL); /* * Map [1, LINUX_MAX_RT_PRIO - 1] to * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down). */ sched_param.sched_priority = (sched_param.sched_priority - 1) * (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) / (LINUX_MAX_RT_PRIO - 1); break; } } tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setscheduler(td, tdt, policy, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct thread *tdt; int error, policy; tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; if (linux_map_sched_prio) { switch (args->policy) { case LINUX_SCHED_OTHER: td->td_retval[0] = 0; return (0); case LINUX_SCHED_FIFO: case LINUX_SCHED_RR: td->td_retval[0] = LINUX_MAX_RT_PRIO - 1; return (0); default: return (EINVAL); } } switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; if (linux_map_sched_prio) { switch (args->policy) { case LINUX_SCHED_OTHER: td->td_retval[0] = 0; return (0); case LINUX_SCHED_FIFO: case LINUX_SCHED_RR: td->td_retval[0] = 1; return (0); default: return (EINVAL); } } switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, &bsd_args)); } int linux_getpid(struct thread *td, struct linux_getpid_args *args) { td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("gettid: emuldata not found.\n")); td->td_retval[0] = em->em_tid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { td->td_retval[0] = kern_getppid(td); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { return (kern_getsid(td, args->pid)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { int error; error = kern_getpriority(td, args->which, args->who); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, args->error_code, 0); /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_VERSION_2 0x20071026 #define _LINUX_CAPABILITY_VERSION_3 0x20080522 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, u32s; if (uap->hdrp == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (uap->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. */ memset(&lucd, 0, u32s * sizeof(lucd[0])); error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0])); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, i, u32s; if (uap->hdrp == NULL || uap->datap == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0])); if (error != 0) return (error); /* We currently don't support setting any capabilities. */ for (i = 0; i < u32s; i++) { if (lucd[i].effective || lucd[i].permitted || lucd[i].inheritable) { linux_msg(td, "capset[%d] effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", i, (int)lucd[i].effective, (int)lucd[i].permitted, (int)lucd[i].inheritable); return (EPERM); } } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size, arg; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; int pdeath_signal, trace_state; switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); pdeath_signal = linux_to_bsd_signal(args->arg2); return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL, &pdeath_signal)); case LINUX_PR_GET_PDEATHSIG: error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS, &pdeath_signal); if (error != 0) return (error); pdeath_signal = bsd_to_linux_signal(pdeath_signal); return (copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal))); /* * In Linux, this flag controls if set[gu]id processes can coredump. * There are additional semantics imposed on processes that cannot * coredump: * - Such processes can not be ptraced. * - There are some semantics around ownership of process-related files * in the /proc namespace. * * In FreeBSD, we can (and by default, do) disable setuid coredump * system-wide with 'sugid_coredump.' We control tracability on a * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag). * By happy coincidence, P2_NOTRACE also prevents coredumping. So the * procctl is roughly analogous to Linux's DUMPABLE. * * So, proxy these knobs to the corresponding PROC_TRACE setting. */ case LINUX_PR_GET_DUMPABLE: error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS, &trace_state); if (error != 0) return (error); td->td_retval[0] = (trace_state != -1); return (0); case LINUX_PR_SET_DUMPABLE: /* * It is only valid for userspace to set one of these two * flags, and only one at a time. */ switch (args->arg2) { case LINUX_SUID_DUMP_DISABLE: trace_state = PROC_TRACE_CTL_DISABLE_EXEC; break; case LINUX_SUID_DUMP_USER: trace_state = PROC_TRACE_CTL_ENABLE; break; default: return (EINVAL); } return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL, &trace_state)); case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). */ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a Linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; case LINUX_PR_GET_SECCOMP: case LINUX_PR_SET_SECCOMP: /* * Same as returned by Linux without CONFIG_SECCOMP enabled. */ error = EINVAL; break; case LINUX_PR_CAPBSET_READ: #if 0 /* * This makes too much noise with Ubuntu Focal. */ linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d", (int)args->arg2); #endif error = EINVAL; break; case LINUX_PR_SET_NO_NEW_PRIVS: arg = args->arg2 == 1 ? PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE; error = kern_procctl(td, P_PID, p->p_pid, PROC_NO_NEW_PRIVS_CTL, &arg); break; case LINUX_PR_SET_PTRACER: linux_msg(td, "unsupported prctl PR_SET_PTRACER"); error = EINVAL; break; default: linux_msg(td, "unsupported prctl option %d", args->option); error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error, policy; error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); if (linux_map_sched_prio) { error = kern_sched_getscheduler(td, tdt, &policy); if (error) goto out; switch (policy) { case SCHED_OTHER: if (sched_param.sched_priority != 0) { error = EINVAL; goto out; } sched_param.sched_priority = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE; break; case SCHED_FIFO: case SCHED_RR: if (sched_param.sched_priority < 1 || sched_param.sched_priority >= LINUX_MAX_RT_PRIO) { error = EINVAL; goto out; } /* * Map [1, LINUX_MAX_RT_PRIO - 1] to * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down). */ sched_param.sched_priority = (sched_param.sched_priority - 1) * (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) / (LINUX_MAX_RT_PRIO - 1); break; } } error = kern_sched_setparam(td, tdt, &sched_param); out: PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error, policy; tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); if (error) { PROC_UNLOCK(tdt->td_proc); return (error); } if (linux_map_sched_prio) { error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); if (error) return (error); switch (policy) { case SCHED_OTHER: sched_param.sched_priority = 0; break; case SCHED_FIFO: case SCHED_RR: /* * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to * [1, LINUX_MAX_RT_PRIO - 1] (rounding up). */ sched_param.sched_priority = (sched_param.sched_priority * (LINUX_MAX_RT_PRIO - 1) + (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) / (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1; break; } } else PROC_UNLOCK(tdt->td_proc); error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } static const struct cpuset_copy_cb copy_set = { .cpuset_copyin = copyin, .cpuset_copyout = copyout }; /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { int error; struct thread *tdt; if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr, ©_set); if (error == 0) td->td_retval[0] = sizeof(cpuset_t); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct thread *tdt; if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr, ©_set)); } struct linux_rlimit64 { uint64_t rlim_cur; uint64_t rlim_max; }; int linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args) { struct rlimit rlim, nrlim; struct linux_rlimit64 lrlim; struct proc *p; u_int which; int flags; int error; if (args->new == NULL && args->old != NULL) { if (linux_get_dummy_limit(args->resource, &rlim)) { lrlim.rlim_cur = rlim.rlim_cur; lrlim.rlim_max = rlim.rlim_max; return (copyout(&lrlim, args->old, sizeof(lrlim))); } } if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); if (args->new != NULL) { /* * Note. Unlike FreeBSD where rlim is signed 64-bit Linux * rlim is unsigned 64-bit. FreeBSD treats negative limits * as INFINITY so we do not need a conversion even. */ error = copyin(args->new, &nrlim, sizeof(nrlim)); if (error != 0) return (error); } flags = PGET_HOLD | PGET_NOTWEXIT; if (args->new != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; if (args->pid == 0) { p = td->td_proc; PHOLD(p); } else { error = pget(args->pid, flags, &p); if (error != 0) return (error); } if (args->old != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur == RLIM_INFINITY) lrlim.rlim_cur = LINUX_RLIM_INFINITY; else lrlim.rlim_cur = rlim.rlim_cur; if (rlim.rlim_max == RLIM_INFINITY) lrlim.rlim_max = LINUX_RLIM_INFINITY; else lrlim.rlim_max = rlim.rlim_max; error = copyout(&lrlim, args->old, sizeof(lrlim)); if (error != 0) goto out; } if (args->new != NULL) error = kern_proc_setrlimit(td, p, which, &nrlim); out: PRELE(p); return (error); } int linux_pselect6(struct thread *td, struct linux_pselect6_args *args) { struct l_timespec lts; struct timespec ts, *tsp; int error; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&ts, <s); if (error != 0) return (error); tsp = &ts; } else tsp = NULL; error = linux_common_pselect6(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tsp, args->sig); if (error != 0) return (error); if (args->tsp != NULL) { error = native_to_linux_timespec(<s, tsp); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } static int linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds, l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp, l_uintptr_t *sig) { struct timeval utv, tv0, tv1, *tvp; struct l_pselect6arg lpse6; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; ssp = NULL; if (sig != NULL) { error = copyin(sig, &lpse6, sizeof(lpse6)); if (error != 0) return (error); if (lpse6.ss_len != sizeof(l_ss)) return (EINVAL); if (lpse6.ss != 0) { error = copyin(PTRIN(lpse6.ss), &l_ss, sizeof(l_ss)); if (error != 0) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } } else ssp = NULL; /* * Currently glibc changes nanosecond number to microsecond. * This mean losing precision but for now it is hardly seen. */ if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&utv, tsp); if (itimerfix(&utv)) return (EINVAL); microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_pselect(td, nfds, readfds, writefds, exceptfds, tvp, ssp, LINUX_NFDBITS); if (error == 0 && tsp != NULL) { if (td->td_retval[0] != 0) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); TIMEVAL_TO_TIMESPEC(&utv, tsp); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_pselect6_time64(struct thread *td, struct linux_pselect6_time64_args *args) { struct l_timespec64 lts; struct timespec ts, *tsp; int error; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec64(&ts, <s); if (error != 0) return (error); tsp = &ts; } else tsp = NULL; error = linux_common_pselect6(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tsp, args->sig); if (error != 0) return (error); if (args->tsp != NULL) { error = native_to_linux_timespec64(<s, tsp); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_ppoll(struct thread *td, struct linux_ppoll_args *args) { struct timespec uts, *tsp; struct l_timespec lts; int error; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error) return (error); error = linux_to_native_timespec(&uts, <s); if (error != 0) return (error); tsp = &uts; } else tsp = NULL; error = linux_common_ppoll(td, args->fds, args->nfds, tsp, args->sset, args->ssize); if (error != 0) return (error); if (tsp != NULL) { error = native_to_linux_timespec(<s, tsp); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } static int linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds, struct timespec *tsp, l_sigset_t *sset, l_size_t ssize) { struct timespec ts0, ts1; struct pollfd stackfds[32]; struct pollfd *kfds; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; if (kern_poll_maxfds(nfds)) return (EINVAL); if (sset != NULL) { if (ssize != sizeof(l_ss)) return (EINVAL); error = copyin(sset, &l_ss, sizeof(l_ss)); if (error) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } else ssp = NULL; if (tsp != NULL) nanotime(&ts0); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else kfds = stackfds; error = linux_pollin(td, kfds, fds, nfds); if (error != 0) goto out; error = kern_poll_kfds(td, kfds, nfds, tsp, ssp); if (error == 0) error = linux_pollout(td, kfds, fds, nfds); if (error == 0 && tsp != NULL) { if (td->td_retval[0]) { nanotime(&ts1); timespecsub(&ts1, &ts0, &ts1); timespecsub(tsp, &ts1, tsp); if (tsp->tv_sec < 0) timespecclear(tsp); } else timespecclear(tsp); } out: if (nfds > nitems(stackfds)) free(kfds, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args) { struct timespec uts, *tsp; struct l_timespec64 lts; int error; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec64(&uts, <s); if (error != 0) return (error); tsp = &uts; } else tsp = NULL; error = linux_common_ppoll(td, args->fds, args->nfds, tsp, args->sset, args->ssize); if (error != 0) return (error); if (tsp != NULL) { error = native_to_linux_timespec64(<s, tsp); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ static int linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error; u_int i; error = copyin(ufds, fds, nfd * sizeof(*fds)); if (error != 0) return (error); for (i = 0; i < nfd; i++) { if (fds->events != 0) linux_to_bsd_poll_events(td, fds->fd, fds->events, &fds->events); fds++; } return (0); } static int linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error = 0; u_int i, n = 0; for (i = 0; i < nfd; i++) { if (fds->revents != 0) { bsd_to_linux_poll_events(fds->revents, &fds->revents); n++; } error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); fds++; ufds++; } td->td_retval[0] = n; return (0); } int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; struct l_timespec lts; struct thread *tdt; int error; /* * According to man in case the invalid pid specified * EINVAL should be returned. */ if (uap->pid < 0) return (EINVAL); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, &ts); PROC_UNLOCK(tdt->td_proc); if (error != 0) return (error); error = native_to_linux_timespec(<s, &ts); if (error != 0) return (error); return (copyout(<s, uap->interval, sizeof(lts))); } /* * In case when the Linux thread is the initial thread in * the thread group thread id is equal to the process id. * Glibc depends on this magic (assert in pthread_getattr_np.c). */ struct thread * linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) { struct linux_emuldata *em; struct thread *tdt; struct proc *p; tdt = NULL; if (tid == 0 || tid == td->td_tid) { tdt = td; PROC_LOCK(tdt->td_proc); } else if (tid > PID_MAX) tdt = tdfind(tid, pid); else { /* * Initial thread where the tid equal to the pid. */ p = pfind(tid); if (p != NULL) { if (SV_PROC_ABI(p) != SV_ABI_LINUX) { /* * p is not a Linuxulator process. */ PROC_UNLOCK(p); return (NULL); } FOREACH_THREAD_IN_PROC(p, tdt) { em = em_find(tdt); if (tid == em->em_tid) return (tdt); } PROC_UNLOCK(p); } return (NULL); } return (tdt); } void linux_to_bsd_waitopts(int options, int *bsdopts) { if (options & LINUX_WNOHANG) *bsdopts |= WNOHANG; if (options & LINUX_WUNTRACED) *bsdopts |= WUNTRACED; if (options & LINUX_WEXITED) *bsdopts |= WEXITED; if (options & LINUX_WCONTINUED) *bsdopts |= WCONTINUED; if (options & LINUX_WNOWAIT) *bsdopts |= WNOWAIT; if (options & __WCLONE) *bsdopts |= WLINUXCLONE; } int linux_getrandom(struct thread *td, struct linux_getrandom_args *args) { struct uio uio; struct iovec iov; int error; if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM)) return (EINVAL); if (args->count > INT_MAX) args->count = INT_MAX; iov.iov_base = args->buf; iov.iov_len = args->count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = iov.iov_len; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK); if (error == 0) td->td_retval[0] = args->count - uio.uio_resid; return (error); } int linux_mincore(struct thread *td, struct linux_mincore_args *args) { /* Needs to be page-aligned */ if (args->start & PAGE_MASK) return (EINVAL); return (kern_mincore(td, args->start, args->len, args->vec)); } #define SYSLOG_TAG "<6>" int linux_syslog(struct thread *td, struct linux_syslog_args *args) { char buf[128], *src, *dst; u_int seq; int buflen, error; if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) { linux_msg(td, "syslog unsupported type 0x%x", args->type); return (EINVAL); } if (args->len < 6) { td->td_retval[0] = 0; return (0); } error = priv_check(td, PRIV_MSGBUF); if (error) return (error); mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); mtx_unlock(&msgbuf_lock); dst = args->buf; error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG)); /* The -1 is to skip the trailing '\0'. */ dst += sizeof(SYSLOG_TAG) - 1; while (error == 0) { mtx_lock(&msgbuf_lock); buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (buflen == 0) break; for (src = buf; src < buf + buflen && error == 0; src++) { if (*src == '\0') continue; if (dst >= args->buf + args->len) goto out; error = copyout(src, dst, 1); dst++; if (*src == '\n' && *(src + 1) != '<' && dst + sizeof(SYSLOG_TAG) < args->buf + args->len) { error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG)); dst += sizeof(SYSLOG_TAG) - 1; } } } out: td->td_retval[0] = dst - args->buf; return (error); } int linux_getcpu(struct thread *td, struct linux_getcpu_args *args) { int cpu, error, node; cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */ error = 0; node = cpuid_to_pcpu[cpu]->pc_domain; if (args->cpu != NULL) error = copyout(&cpu, args->cpu, sizeof(l_int)); if (args->node != NULL) error = copyout(&node, args->node, sizeof(l_int)); return (error); } #if defined(__i386__) || defined(__amd64__) int linux_poll(struct thread *td, struct linux_poll_args *args) { struct timespec ts, *tsp; if (args->timeout != INFTIM) { if (args->timeout < 0) return (EINVAL); ts.tv_sec = args->timeout / 1000; ts.tv_nsec = (args->timeout % 1000) * 1000000; tsp = &ts; } else tsp = NULL; return (linux_common_ppoll(td, args->fds, args->nfds, tsp, NULL, 0)); } #endif /* __i386__ || __amd64__ */ int linux_seccomp(struct thread *td, struct linux_seccomp_args *args) { switch (args->op) { case LINUX_SECCOMP_GET_ACTION_AVAIL: return (EOPNOTSUPP); default: /* * Ignore unknown operations, just like Linux kernel built * without CONFIG_SECCOMP. */ return (EINVAL); } }