diff --git a/lib/libc/sys/procctl.2 b/lib/libc/sys/procctl.2 index 432ed5919a81..ce7a2be5d5e4 100644 --- a/lib/libc/sys/procctl.2 +++ b/lib/libc/sys/procctl.2 @@ -1,771 +1,781 @@ .\" Copyright (c) 2013 Hudson River Trading LLC .\" Written by: John H. Baldwin .\" All rights reserved. .\" .\" Copyright (c) 2014 The FreeBSD Foundation .\" Portions of this documentation were written by Konstantin Belousov .\" under sponsorship from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd July 1, 2021 .Dt PROCCTL 2 .Os .Sh NAME .Nm procctl .Nd control processes .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/procctl.h .Ft int .Fn procctl "idtype_t idtype" "id_t id" "int cmd" "void *data" .Sh DESCRIPTION The .Fn procctl system call provides for control over processes. The .Fa idtype and .Fa id arguments specify the set of processes to control. If multiple processes match the identifier, .Nm will make a .Dq best effort to control as many of the selected processes as possible. An error is only returned if no selected processes successfully complete the request. The following identifier types are supported: .Bl -tag -width P_PGID .It Dv P_PID Control the process with the process ID .Fa id . .It Dv P_PGID Control processes belonging to the process group with the ID .Fa id . .El .Pp The control request to perform is specified by the .Fa cmd argument. The following commands are supported: .Bl -tag -width PROC_TRAPCAP_STATUS .It Dv PROC_ASLR_CTL Controls the Address Space Layout Randomization (ASLR) in the program images created by .Xr execve 2 in the specified process or its descendants that did not changed the control nor modified it by other means. The .Fa data parameter must point to the integer variable holding one of the following values: .Bl -tag -width PROC_ASLR_FORCE_DISABLE .It Dv PROC_ASLR_FORCE_ENABLE Request that ASLR is enabled after execution, even if it is disabled system-wide. The image flag and set-uid might prevent ASLR enablement still. .It Dv PROC_ASLR_FORCE_DISABLE Request that ASLR is disabled after execution. Same notes as for .Dv PROC_ASLR_FORCE_ENABLE apply. .It Dv PROC_ASLR_NOFORCE Use the system-wide configured policy for ASLR. .El .It Dv PROC_ASLR_STATUS Returns the current status of ASLR enablement for the target process. The .Fa data parameter must point to the integer variable, where one of the following values is written: .Bl -tag -width PROC_ASLR_FORCE_DISABLE .It Dv PROC_ASLR_FORCE_ENABLE .It Dv PROC_ASLR_FORCE_DISABLE .It Dv PROC_ASLR_NOFORCE .El .Pp If the currently executed image in the process itself has ASLR enabled, the .Dv PROC_ASLR_ACTIVE flag is or-ed with the value listed above. .It Dv PROC_PROTMAX_CTL Controls implicit application of PROT_MAX protection equal to the .Fa prot argument of the .Xr mmap 2 syscall, in the target process. The .Fa data parameter must point to the integer variable holding one of the following values: .Bl -tag -width PROC_PROTMAX_FORCE_DISABLE .It Dv PROC_PROTMAX_FORCE_ENABLE Enables implicit PROT_MAX application, even if it is disabled system-wide by the sysctl .Va vm.imply_prot_max . The image flag might still prevent the enablement. .It Dv PROC_PROTMAX_FORCE_DISABLE Request that implicit application of PROT_MAX be disabled. Same notes as for .Dv PROC_PROTMAX_FORCE_ENABLE apply. .It Dv PROC_PROTMAX_NOFORCE Use the system-wide configured policy for PROT_MAX. .El .It Dv PROC_PROTMAX_STATUS Returns the current status of implicit PROT_MAX enablement for the target process. The .Fa data parameter must point to the integer variable, where one of the following values is written: .Bl -tag -width PROC_PROTMAX_FORCE_DISABLE .It Dv PROC_PROTMAX_FORCE_ENABLE .It Dv PROC_PROTMAX_FORCE_DISABLE .It Dv PROC_PROTMAX_NOFORCE .El .Pp If the currently executed image in the process itself has implicit PROT_MAX application enabled, the .Dv PROC_PROTMAX_ACTIVE flag is or-ed with the value listed above. .It Dv PROC_SPROTECT Set process protection state. This is used to mark a process as protected from being killed if the system exhausts the available memory and swap. The .Fa data parameter must point to an integer containing an operation and zero or more optional flags. The following operations are supported: .Bl -tag -width PPROT_CLEAR .It Dv PPROT_SET Mark the selected processes as protected. .It Dv PPROT_CLEAR Clear the protected state of selected processes. .El .Pp The following optional flags are supported: .Bl -tag -width PPROT_DESCEND .It Dv PPROT_DESCEND Apply the requested operation to all child processes of each selected process in addition to each selected process. .It Dv PPROT_INHERIT When used with .Dv PPROT_SET , mark all future child processes of each selected process as protected. Future child processes will also mark all of their future child processes. .El .It Dv PROC_REAP_ACQUIRE Acquires the reaper status for the current process. Reaper status means that children orphaned by the reaper's descendants that were forked after the acquisition of reaper status are reparented to the reaper process. After system initialization, .Xr init 8 is the default reaper. .It Dv PROC_REAP_RELEASE Release the reaper state for the current process. The reaper of the current process becomes the new reaper of the current process's descendants. .It Dv PROC_REAP_STATUS Provides information about the reaper of the specified process, or the process itself when it is a reaper. The .Fa data argument must point to a .Vt procctl_reaper_status structure which is filled in by the syscall on successful return. .Bd -literal struct procctl_reaper_status { u_int rs_flags; u_int rs_children; u_int rs_descendants; pid_t rs_reaper; pid_t rs_pid; }; .Ed The .Fa rs_flags may have the following flags returned: .Bl -tag -width REAPER_STATUS_REALINIT .It Dv REAPER_STATUS_OWNED The specified process has acquired reaper status and has not released it. When the flag is returned, the specified process .Fa id , pid, identifies the reaper, otherwise the .Fa rs_reaper field of the structure is set to the pid of the reaper for the specified process id. .It Dv REAPER_STATUS_REALINIT The specified process is the root of the reaper tree, i.e., .Xr init 8 . .El .Pp The .Fa rs_children field returns the number of children of the reaper among the descendants. It is possible to have a child whose reaper is not the specified process, since the reaper for any existing children is not reset on the .Dv PROC_REAP_ACQUIRE operation. The .Fa rs_descendants field returns the total number of descendants of the reaper(s), not counting descendants of the reaper in the subtree. The .Fa rs_reaper field returns the reaper pid. The .Fa rs_pid returns the pid of one reaper child if there are any descendants. .It Dv PROC_REAP_GETPIDS Queries the list of descendants of the reaper of the specified process. The request takes a pointer to a .Vt procctl_reaper_pids structure in the .Fa data parameter. .Bd -literal struct procctl_reaper_pids { u_int rp_count; struct procctl_reaper_pidinfo *rp_pids; }; .Ed When called, the .Fa rp_pids field must point to an array of .Vt procctl_reaper_pidinfo structures, to be filled in on return, and the .Fa rp_count field must specify the size of the array, into which no more than .Fa rp_count elements will be filled in by the kernel. .Pp The .Vt "struct procctl_reaper_pidinfo" structure provides some information about one of the reaper's descendants. Note that for a descendant that is not a child, it may be incorrectly identified because of a race in which the original child process exited and the exited process's pid was reused for an unrelated process. .Bd -literal struct procctl_reaper_pidinfo { pid_t pi_pid; pid_t pi_subtree; u_int pi_flags; }; .Ed The .Fa pi_pid field is the process id of the descendant. The .Fa pi_subtree field provides the pid of the child of the reaper, which is the (grand-)parent of the process. The .Fa pi_flags field returns the following flags, further describing the descendant: .Bl -tag -width REAPER_PIDINFO_REAPER .It Dv REAPER_PIDINFO_VALID Set to indicate that the .Vt procctl_reaper_pidinfo structure was filled in by the kernel. Zero-filling the .Fa rp_pids array and testing the .Dv REAPER_PIDINFO_VALID flag allows the caller to detect the end of the returned array. .It Dv REAPER_PIDINFO_CHILD The .Fa pi_pid field identifies the direct child of the reaper. .It Dv REAPER_PIDINFO_REAPER The reported process is itself a reaper. The descendants of the subordinate reaper are not reported. .El .It Dv PROC_REAP_KILL Request to deliver a signal to some subset of the descendants of the reaper. The .Fa data parameter must point to a .Vt procctl_reaper_kill structure, which is used both for parameters and status return. .Bd -literal struct procctl_reaper_kill { int rk_sig; u_int rk_flags; pid_t rk_subtree; u_int rk_killed; pid_t rk_fpid; }; .Ed The .Fa rk_sig field specifies the signal to be delivered. Zero is not a valid signal number, unlike for .Xr kill 2 . The .Fa rk_flags field further directs the operation. It is or-ed from the following flags: .Bl -tag -width REAPER_KILL_CHILDREN .It Dv REAPER_KILL_CHILDREN Deliver the specified signal only to direct children of the reaper. .It Dv REAPER_KILL_SUBTREE Deliver the specified signal only to descendants that were forked by the direct child with pid specified in the .Fa rk_subtree field. .El If neither the .Dv REAPER_KILL_CHILDREN nor the .Dv REAPER_KILL_SUBTREE flags are specified, all current descendants of the reaper are signalled. .Pp If a signal was delivered to any process, the return value from the request is zero. In this case, the .Fa rk_killed field identifies the number of processes signalled. The .Fa rk_fpid field is set to the pid of the first process for which signal delivery failed, e.g., due to permission problems. If no such process exists, the .Fa rk_fpid field is set to -1. .It Dv PROC_TRACE_CTL Enable or disable tracing of the specified process(es), according to the value of the integer argument. Tracing includes attachment to the process using the .Xr ptrace 2 and .Xr ktrace 2 , debugging sysctls, .Xr hwpmc 4 , .Xr dtrace 1 , and core dumping. Possible values for the .Fa data argument are: .Bl -tag -width PROC_TRACE_CTL_DISABLE_EXEC .It Dv PROC_TRACE_CTL_ENABLE Enable tracing, after it was disabled by .Dv PROC_TRACE_CTL_DISABLE . Only allowed for self. .It Dv PROC_TRACE_CTL_DISABLE Disable tracing for the specified process. Tracing is re-enabled when the process changes the executing program with the .Xr execve 2 syscall. A child inherits the trace settings from the parent on .Xr fork 2 . .It Dv PROC_TRACE_CTL_DISABLE_EXEC Same as .Dv PROC_TRACE_CTL_DISABLE , but the setting persists for the process even after .Xr execve 2 . .El .It Dv PROC_TRACE_STATUS Returns the current tracing status for the specified process in the integer variable pointed to by .Fa data . If tracing is disabled, .Fa data is set to -1. If tracing is enabled, but no debugger is attached by the .Xr ptrace 2 syscall, .Fa data is set to 0. If a debugger is attached, .Fa data is set to the pid of the debugger process. .It Dv PROC_TRAPCAP_CTL Controls the capability mode sandbox actions for the specified sandboxed processes, on a return from any syscall which gives either a .Er ENOTCAPABLE or .Er ECAPMODE error. If the control is enabled, such errors from the syscalls cause delivery of the synchronous .Dv SIGTRAP signal to the thread immediately before returning from the syscalls. .Pp Possible values for the .Fa data argument are: .Bl -tag -width PROC_TRAPCAP_CTL_DISABLE .It Dv PROC_TRAPCAP_CTL_ENABLE Enable the .Dv SIGTRAP signal delivery on capability mode access violations. The enabled mode is inherited by the children of the process, and is kept after .Xr fexecve 2 calls. .It Dv PROC_TRAPCAP_CTL_DISABLE Disable the signal delivery on capability mode access violations. Note that the global sysctl .Dv kern.trap_enotcap might still cause the signal to be delivered. See .Xr capsicum 4 . .El .Pp On signal delivery, the .Va si_errno member of the .Fa siginfo signal handler parameter is set to the syscall error value, and the .Va si_code member is set to .Dv TRAP_CAP . +The system call number is stored in the +.Va si_syscall +field of the +.Fa siginfo +signal handler parameter. +The other system call parameters can be read from the +.Fa ucontext_t +but the system call number is typically stored in the register +that also contains the return value and so is unavailable in the +signal handler. .Pp See .Xr capsicum 4 for more information about the capability mode. .It Dv PROC_TRAPCAP_STATUS Return the current status of signalling capability mode access violations for the specified process. The integer value pointed to by the .Fa data argument is set to the .Dv PROC_TRAPCAP_CTL_ENABLE value if the process control enables signal delivery, and to .Dv PROC_TRAPCAP_CTL_DISABLE otherwise. .Pp See the note about sysctl .Dv kern.trap_enotcap above, which gives independent global control of signal delivery. .It Dv PROC_PDEATHSIG_CTL Request the delivery of a signal when the parent of the calling process exits. .Fa idtype must be .Dv P_PID and .Fa id must be the either caller's pid or zero, with no difference in effect. The value is cleared for child processes and when executing set-user-ID or set-group-ID binaries. .Fa data must point to a value of type .Vt int indicating the signal that should be delivered to the caller. Use zero to cancel a previously requested signal delivery. .It Dv PROC_PDEATHSIG_STATUS Query the current signal number that will be delivered when the parent of the calling process exits. .Fa idtype must be .Dv P_PID and .Fa id must be the either caller's pid or zero, with no difference in effect. .Fa data must point to a memory location that can hold a value of type .Vt int . If signal delivery has not been requested, it will contain zero on return. .It Dv PROC_STACKGAP_CTL Controls the stack gaps in the specified process. A stack gap is the part of the growth area for a .Dv MAP_STACK mapped region that is reserved and never filled by memory. Instead, the process is guaranteed to receive a .Dv SIGSEGV signal on accessing pages in the gap. Gaps protect against stack overflow corrupting memory adjacent to the stack. .Pp The .Fa data argument must point to an integer variable containing flags. The following flags are allowed: .Bl -tag -width PROC_STACKGAP_DISABLE_EXEC .It Dv PROC_STACKGAP_ENABLE This flag is only accepted for consistency with .Dv PROC_STACKGAP_STATUS . If stack gaps are enabled, the flag is ignored. If disabled, the flag causes an .Ev EINVAL error to be returned. After gaps are disabled in a process, they can only be re-enabled when an .Xr execve 2 is performed. .It Dv PROC_STACKGAP_DISABLE Disable stack gaps for the process. For existing stacks, the gap is no longer a reserved part of the growth area and can be filled by memory on access. .It Dv PROC_STACKGAP_ENABLE_EXEC Enable stack gaps for programs started after an .Xr execve 2 by the specified process. .It Dv PROC_STACKGAP_DISABLE_EXEC Inherit disabled stack gaps state after .Xr execve 2 . In other words, if the currently executing program has stack gaps disabled, they are kept disabled on exec. If gaps were enabled, they are kept enabled after exec. .El .Pp The stack gap state is inherited from the parent on .Xr fork 2 . .It Dv PROC_STACKGAP_STATUS Returns the current stack gap state for the specified process. .Fa data must point to an integer variable, which is used to return a bitmask consisting of the following flags: .Bl -tag -width PROC_STACKGAP_DISABLE_EXEC .It Dv PROC_STACKGAP_ENABLE Stack gaps are enabled. .It Dv PROC_STACKGAP_DISABLE Stack gaps are disabled. .It Dv PROC_STACKGAP_ENABLE_EXEC Stack gaps are enabled in the process after .Xr execve 2 . .It Dv PROC_STACKGAP_DISABLE_EXEC Stack gaps are disabled in the process after .Xr execve 2 . .El .It Dv PROC_NO_NEW_PRIVS_CTL Allows one to ignore the SUID and SGID bits on the program images activated by .Xr execve 2 in the specified process and its future descendants. The .Fa data parameter must point to the integer variable holding the following value: .Bl -tag -width PROC_NO_NEW_PRIVS_ENABLE .It Dv PROC_NO_NEW_PRIVS_ENABLE Request SUID and SGID bits to be ignored. .El .Pp It is not possible to disable it once it has been enabled. .It Dv PROC_NO_NEW_PRIVS_STATUS Returns the current status of SUID/SGID enablement for the target process. The .Fa data parameter must point to the integer variable, where one of the following values is written: .Bl -tag -width PROC_NO_NEW_PRIVS_DISABLE .It Dv PROC_NO_NEW_PRIVS_ENABLE .It Dv PROC_NO_NEW_PRIVS_DISABLE .El .El .Sh x86 MACHINE-SPECIFIC REQUESTS .Bl -tag -width PROC_KPTI_STATUS .It Dv PROC_KPTI_CTL AMD64 only. Controls the Kernel Page Table Isolation (KPTI) option for the children of the specified process. For the command to work, the .Va vm.pmap.kpti tunable must be enabled on boot. It is not possible to change the KPTI setting for a running process, except at the .Xr execve 2 , where the address space is reinitialized. .Pp The .Fa data parameter must point to an integer variable containing one of the following commands: .Bl -tag -width PROC_KPTI_CTL_DISABLE_ON_EXEC .It Dv PROC_KPTI_CTL_ENABLE_ON_EXEC Enable KPTI after .Xr execve 2 . .It Dv PROC_KPTI_CTL_DISABLE_ON_EXEC Disable KPTI after .Xr execve 2 . Only root or a process having the .Va PRIV_IO privilege might use this option. .El .It Dv PROC_KPTI_STATUS Returns the current KPTI status for the specified process. .Fa data must point to the integer variable, which returns the following statuses: .Bl -tag -width PROC_KPTI_CTL_DISABLE_ON_EXEC .It Dv PROC_KPTI_CTL_ENABLE_ON_EXEC .It Dv PROC_KPTI_CTL_DISABLE_ON_EXEC .El .Pp The status is or-ed with the .Va PROC_KPTI_STATUS_ACTIVE in case KPTI is active for the current address space of the process. .Sh NOTES Disabling tracing on a process should not be considered a security feature, as it is bypassable both by the kernel and privileged processes, and via other system mechanisms. As such, it should not be utilized to reliably protect cryptographic keying material or other confidential data. .Sh RETURN VALUES If an error occurs, a value of -1 is returned and .Va errno is set to indicate the error. .Sh ERRORS The .Fn procctl system call will fail if: .Bl -tag -width Er .It Bq Er EFAULT The .Fa data parameter points outside the process's allocated address space. .It Bq Er EINVAL The .Fa cmd argument specifies an unsupported command. .Pp The .Fa idtype argument specifies an unsupported identifier type. .It Bq Er EPERM The calling process does not have permission to perform the requested operation on any of the selected processes. .It Bq Er ESRCH No processes matched the requested .Fa idtype and .Fa id . .It Bq Er EINVAL An invalid operation or flag was passed in .Fa data for a .Dv PROC_SPROTECT command. .It Bq Er EPERM The .Fa idtype argument is not equal to .Dv P_PID , or .Fa id is not equal to the pid of the calling process, for .Dv PROC_REAP_ACQUIRE or .Dv PROC_REAP_RELEASE requests. .It Bq Er EINVAL Invalid or undefined flags were passed to a .Dv PROC_REAP_KILL request. .It Bq Er EINVAL An invalid or zero signal number was requested for a .Dv PROC_REAP_KILL request. .It Bq Er EINVAL The .Dv PROC_REAP_RELEASE request was issued by the .Xr init 8 process. .It Bq Er EBUSY The .Dv PROC_REAP_ACQUIRE request was issued by a process that had already acquired reaper status and has not yet released it. .It Bq Er EBUSY The .Dv PROC_TRACE_CTL request was issued for a process already being traced. .It Bq Er EPERM The .Dv PROC_TRACE_CTL request to re-enable tracing of the process .Po Dv PROC_TRACE_CTL_ENABLE Pc , or to disable persistence of .Dv PROC_TRACE_CTL_DISABLE on .Xr execve 2 was issued for a non-current process. .It Bq Er EINVAL The value of the integer .Fa data parameter for the .Dv PROC_TRACE_CTL or .Dv PROC_TRAPCAP_CTL request is invalid. .It Bq Er EINVAL The .Dv PROC_PDEATHSIG_CTL or .Dv PROC_PDEATHSIG_STATUS request referenced an unsupported .Fa id , .Fa idtype or invalid signal number. .El .Sh SEE ALSO .Xr dtrace 1 , .Xr proccontrol 1 , .Xr protect 1 , .Xr cap_enter 2 , .Xr kill 2 , .Xr ktrace 2 , .Xr mmap 2 , .Xr mprotect 2 , .Xr ptrace 2 , .Xr wait 2 , .Xr capsicum 4 , .Xr hwpmc 4 , .Xr init 8 .Sh HISTORY The .Fn procctl function appeared in .Fx 10.0 . .Pp The reaper facility is based on a similar feature of Linux and DragonflyBSD, and first appeared in .Fx 10.2 . .Pp The .Dv PROC_PDEATHSIG_CTL facility is based on the prctl(PR_SET_PDEATHSIG, ...) feature of Linux, and first appeared in .Fx 11.2 . .Pp The ASLR support was added to system for the checklists compliance in .Fx 13.0 . diff --git a/share/man/man3/siginfo.3 b/share/man/man3/siginfo.3 index acc8785b2f0d..7f8a809cdfa5 100644 --- a/share/man/man3/siginfo.3 +++ b/share/man/man3/siginfo.3 @@ -1,348 +1,350 @@ .\" Copyright (c) 2005 David Xu .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in .\" the documentation and/or other materials provided with the .\" distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR .\" BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, .\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE .\" OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, .\" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd February 17, 2021 .Dt SIGINFO 3 .Os .Sh NAME .Nm siginfo .Nd "signal generation information" .Sh SYNOPSIS .In signal.h .Sh DESCRIPTION A process may request signal information when it is catching a signal. The information specifies why the system generated that signal. To request signal information in a signal handler, the user can set .Dv SA_SIGINFO in .Va sa_flags before .Xr sigaction 2 is called, otherwise the user can use .Xr sigwaitinfo 2 and .Xr sigtimedwait 2 to get signal information. In either case, the system returns the information in a structure of type .Vt siginfo_t , which includes the following information: .Bl -column ".Vt union signal" ".Va si_overrun" .It Sy Type Ta Sy Member Ta Sy Description .It Vt int Ta Va si_signo Ta signal number .It Vt int Ta Va si_errno Ta error number .It Vt int Ta Va si_code Ta signal code .It Vt union sigval Ta Va si_value Ta signal value .It Vt pid_t Ta Va si_pid Ta sending process ID .It Vt uid_t Ta Va si_uid Ta sending process's real user ID .It Vt void Ta Va *si_addr Ta virtual address .It Vt int Ta Va si_status Ta exit value or signal .It Vt long Ta Va si_band Ta band event for .Dv SIGPOLL .It Vt int Ta Va si_trapno Ta machine trap code .It Vt int Ta Va si_timerid Ta .Tn POSIX timer ID .It Vt int Ta Va si_overrun Ta .Tn POSIX timer overrun count .It Vt int Ta Va si_mqd Ta .Tn POSIX message queue ID +.It Vt int Ta Va si_syscall Ta +system-call number for system calls blocked by Capsicum .El .Pp The .Va si_signo member contains the signal number. .Pp The .Va si_errno member contains an error number defined in the file .In errno.h . .Pp The .Va si_code member contains a code which describes the cause of the signal. The macros specified in the .Sy Code column of the following table are defined for use as values of .Va si_code that are signal-specific or non-signal-specific reasons why the signal was generated: .Bl -column ".Dv SIGPOLL" ".Dv CLD_CONTINUED" .It Sy Signal Ta Sy Code Ta Sy Reason .It Dv SIGILL Ta Dv ILL_ILLOPC Ta illegal opcode .It Ta Dv ILL_ILLOPN Ta illegal operand .It Ta Dv ILL_ILLADR Ta illegal addressing mode .It Ta Dv ILL_ILLTRP Ta illegal trap .It Ta Dv ILL_PRVOPC Ta illegal privileged opcode .It Ta Dv ILL_PRVREG Ta illegal privileged register .It Ta Dv ILL_COPROC Ta coprocessor error .It Ta Dv ILL_BADSTK Ta internal stack error .It Dv SIGFPE Ta Dv FPE_INTDIV Ta integer divide by zero .It Ta Dv FPE_INTOVF Ta integer overflow .It Ta Dv FPE_FLTDIV Ta floating-point divide by zero .It Ta Dv FPE_FLTOVF Ta floating-point overflow .It Ta Dv FPE_FLTUND Ta floating-point underflow .It Ta Dv FPE_FLTRES Ta floating-point inexact result .It Ta Dv FPE_FLTINV Ta invalid floating-point operation .It Ta Dv FPE_FLTSUB Ta subscript out of range .It Dv SIGSEGV Ta Dv SEGV_MAPERR Ta address not mapped to object .It Ta Dv SEGV_ACCERR Ta invalid permissions for mapped object .It Dv SIGBUS Ta Dv BUS_ADRALN Ta invalid address alignment .It Ta Dv BUS_ADRERR Ta nonexistent physical address .It Ta Dv BUS_OBJERR Ta object-specific hardware error .It Ta Dv BUS_OOMERR Ta cannot alloc a page to map at fault .It Dv SIGTRAP Ta Dv TRAP_BRKPT Ta process breakpoint .It Ta Dv TRAP_TRACE Ta process trace trap .It Ta Dv TRAP_DTRACE Ta DTrace induced trap .It Ta Dv TRAP_CAP Ta capabilities protective trap .It Dv SIGCHLD Ta Dv CLD_EXITED Ta child has exited .It Ta Dv CLD_KILLED Ta child has terminated abnormally and did not create a core file .It Ta Dv CLD_DUMPED Ta child has terminated abnormally and created a core file .It Ta Dv CLD_TRAPPED Ta traced child has trapped .It Ta Dv CLD_STOPPED Ta child has stopped .It Ta Dv CLD_CONTINUED Ta stopped child has continued .It Dv SIGPOLL Ta Dv POLL_IN Ta data input available .It Ta Dv POLL_OUT Ta output buffers available .It Ta Dv POLL_MSG Ta input message available .It Ta Dv POLL_ERR Ta I/O error .It Ta Dv POLL_PRI Ta high priority input available .It Ta Dv POLL_HUP Ta device disconnected .It Any Ta Dv SI_NOINFO Ta Only the .Va si_signo member is meaningful; the value of all other members is unspecified. .It Ta Dv SI_USER Ta signal sent by .Xr kill 2 .It Ta Dv SI_QUEUE Ta signal sent by .Xr sigqueue 2 .It Ta Dv SI_TIMER Ta signal generated by expiration of a timer set by .Xr timer_settime 2 .It Ta Dv SI_ASYNCIO Ta signal generated by completion of an asynchronous I/O request .It Ta Dv SI_MESGQ Ta signal generated by arrival of a message on an empty message queue .It Ta Dv SI_KERNEL Ta signal generated by miscellaneous parts of the kernel .It Ta Dv SI_LWP Ta signal sent by .Xr pthread_kill 3 .El .Pp For synchronous signals, .Va si_addr is generally set to the address of the faulting instruction. However, synchronous signals raised by a faulting memory access such as .Dv SIGSEGV and .Dv SIGBUS may report the address of the faulting memory access (if available) in .Va si_addr instead. Additionally .Dv SIGTRAP raised by a hardware watchpoint exception may report the data address that triggered the watchpoint in .Va si_addr . .Pp Sychronous signals set .Va si_trapno to a machine-dependent trap number. .Pp In addition, the following signal-specific information is available: .Bl -column ".Dv SIGPOLL" ".Dv CLD_CONTINUED" .It Sy Signal Ta Sy Member Ta Sy Value .It Dv SIGCHLD Ta Va si_pid Ta child process ID .It Ta Va si_status Ta exit value or signal; if .Va si_code is equal to .Dv CLD_EXITED , then it is equal to the exit value of the child process, otherwise, it is equal to a signal that caused the child process to change state. .It Ta Va si_uid Ta "real user ID of the process that sent the signal" .It Dv SIGPOLL Ta Va si_band Ta "band event for" .Dv POLL_IN , POLL_OUT , or .Dv POLL_MSG .El .Pp Finally, the following code-specific information is available: .Bl -column ".Dv SI_ASYNCIO" ".Va si_overrun" .It Sy Code Ta Sy Member Ta Sy Value .It Dv SI_USER Ta Va si_pid Ta the process ID that sent the signal .It Ta Va si_uid Ta real user ID of the process that sent the signal .It Dv SI_QUEUE Ta Va si_value Ta the value passed to .Xr sigqueue 2 system call .It Ta Va si_pid Ta the process ID that sent the signal .It Ta Va si_uid Ta real user ID of the process that sent the signal .It Dv SI_TIMER Ta Va si_value Ta the value passed to .Xr timer_create 2 system call .It Ta Va si_timerid Ta the timer ID returned by .Xr timer_create 2 system call .It Ta Va si_overrun Ta timer overrun count corresponding to the signal .It Ta Va si_errno Ta If timer overrun will be .Brq Dv DELAYTIMER_MAX , an error code defined in .In errno.h is set .It Dv SI_ASYNCIO Ta Va si_value Ta the value passed to aio system calls .It Dv SI_MESGQ Ta Va si_value Ta the value passed to .Xr mq_notify 2 system call .It Ta Va si_mqd Ta the ID of the message queue which generated the signal .It Dv SI_LWP Ta Va si_pid Ta the process ID that sent the signal .It Ta Va si_uid Ta real user ID of the process that sent the signal .El .Sh NOTES Currently, the kernel never generates the .Dv SIGPOLL signal. .Dv SIGCHLD signal is queued when a process changed its status or exited. .Tn POSIX Realtime Extensions like aio, timer, and message queue also queue signals. Signals with code .Dv SI_USER , .Dv SI_KERNEL or .Dv SI_LWP are only queued if there are sufficient resources; otherwise, .Dv SI_NOINFO results. For some hardware architectures, the exact value of .Va si_addr might not be available. .Sh SEE ALSO .Xr aio_read 2 , .Xr kill 2 , .Xr mq_notify 2 , .Xr sigaction 2 , .Xr sigqueue 2 , .Xr sigwaitinfo 2 , .Xr timer_create 2 , .Xr timer_settime 2 , .Xr waitpid 2 , .Xr pthread_kill 3 .Sh STANDARDS The .Vt siginfo_t type conforms to .St -p1003.1-2004 . .Sh HISTORY Full support for .Tn POSIX signal information first appeared in .Fx 7.0 . The codes .Dv SI_USER and .Dv SI_KERNEL can be generated as of .Fx 8.1 . The code .Dv SI_LWP can be generated as of .Fx 9.0 . .Sh AUTHORS This manual page was written by .An David Xu Aq Mt davidxu@FreeBSD.org . diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 55649687ce50..b08495f3f139 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -1,1225 +1,1226 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include #endif extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg), IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall); void __noinline trap(struct trapframe *frame); void trap_check(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, bool, int *, int *); static void trap_fatal(struct trapframe *, vm_offset_t); #ifdef KDTRACE_HOOKS static bool trap_user_dtrace(struct trapframe *, int (**hook)(struct trapframe *)); #endif static const char UNKNOWN[] = "unknown"; static const char *const trap_msg[] = { [0] = UNKNOWN, /* unused */ [T_PRIVINFLT] = "privileged instruction fault", [2] = UNKNOWN, /* unused */ [T_BPTFLT] = "breakpoint instruction fault", [4] = UNKNOWN, /* unused */ [5] = UNKNOWN, /* unused */ [T_ARITHTRAP] = "arithmetic trap", [7] = UNKNOWN, /* unused */ [8] = UNKNOWN, /* unused */ [T_PROTFLT] = "general protection fault", [T_TRCTRAP] = "debug exception", [11] = UNKNOWN, /* unused */ [T_PAGEFLT] = "page fault", [13] = UNKNOWN, /* unused */ [T_ALIGNFLT] = "alignment fault", [15] = UNKNOWN, /* unused */ [16] = UNKNOWN, /* unused */ [17] = UNKNOWN, /* unused */ [T_DIVIDE] = "integer divide fault", [T_NMI] = "non-maskable interrupt trap", [T_OFLOW] = "overflow trap", [T_BOUND] = "FPU bounds check fault", [T_DNA] = "FPU device not available", [T_DOUBLEFLT] = "double fault", [T_FPOPFLT] = "FPU operand fetch fault", [T_TSSFLT] = "invalid TSS fault", [T_SEGNPFLT] = "segment not present fault", [T_STKFLT] = "stack fault", [T_MCHK] = "machine check trap", [T_XMMFLT] = "SIMD floating-point exception", [T_RESERVED] = "reserved (unknown) fault", [31] = UNKNOWN, /* reserved */ [T_DTRACE_RET] = "DTrace pid return trap", }; static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Control L1D flush on return from NMI. * * Tunable can be set to the following values: * 0 - only enable flush on return from NMI if required by vmm.ko (default) * >1 - always flush on return from NMI. * * Post-boot, the sysctl indicates if flushing is currently enabled. */ int nmi_flush_l1d_sw; SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN, &nmi_flush_l1d_sw, 0, "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist"); /* * Table of handlers for various segment load faults. */ static const struct { uintptr_t faddr; uintptr_t fhandler; } sfhandlers[] = { { .faddr = (uintptr_t)ld_ds, .fhandler = (uintptr_t)ds_load_fault, }, { .faddr = (uintptr_t)ld_es, .fhandler = (uintptr_t)es_load_fault, }, { .faddr = (uintptr_t)ld_fs, .fhandler = (uintptr_t)fs_load_fault, }, { .faddr = (uintptr_t)ld_gs, .fhandler = (uintptr_t)gs_load_fault, }, { .faddr = (uintptr_t)ld_gsbase, .fhandler = (uintptr_t)gsbase_load_fault }, { .faddr = (uintptr_t)ld_fsbase, .fhandler = (uintptr_t)fsbase_load_fault, }, }; /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; register_t addr, dr6; size_t i; int pf, signo, ucode; u_int type; td = curthread; p = td->td_proc; dr6 = 0; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(frame) != 0) return; #endif } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) { uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); } else { switch (type) { case T_NMI: case T_BPTFLT: case T_TRCTRAP: case T_PROTFLT: case T_SEGNPFLT: case T_STKFLT: break; default: printf( "kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ #ifdef KDTRACE_HOOKS if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr)) return; #else enable_intr(); #endif signo = SIGTRAP; ucode = TRAP_BRKPT; break; case T_TRCTRAP: /* debug exception */ enable_intr(); signo = SIGTRAP; ucode = TRAP_TRACE; dr6 = rdr6(); if ((dr6 & DBREG_DR6_BS) != 0) { PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; } PROC_UNLOCK(td->td_proc); } break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; case T_PROTFLT: /* general protection fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Can emulator handle this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) return; pf = trap_pfault(frame, true, &signo, &ucode); if (pf == -1) return; if (pf == 0) goto userret; addr = frame->tf_addr; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; case T_NMI: nmi_handle_intr(type, frame); return; case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); return; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: (void)trap_user_dtrace(frame, &dtrace_return_probe_ptr); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void)trap_pfault(frame, false, NULL, NULL); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); return; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); return; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. * * In case of PTI, the IRETQ faulted while the * kernel used the pti stack, and exception * frame records %rsp value pointing to that * stack. If we return normally to * doreti_iret_fault, the trapframe is * reconstructed on pti stack, and calltrap() * called on it as well. Due to the very * limited pti stack size, kernel does not * survive for too long. Switch to the normal * thread stack for the trap handling. * * Magic '5' is the number of qwords occupied by * the hardware trap frame. */ if (frame->tf_rip == (long)doreti_iret) { KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); frame->tf_rip = (long)doreti_iret_fault; if ((PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3) && (frame->tf_rsp == (uintptr_t)PCPU_GET( pti_rsp0) - 5 * sizeof(register_t))) { frame->tf_rsp = PCPU_GET(rsp0) - 5 * sizeof(register_t); } return; } for (i = 0; i < nitems(sfhandlers); i++) { if (frame->tf_rip == sfhandlers[i].faddr) { KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); frame->tf_rip = sfhandlers[i].fhandler; return; } } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* debug exception */ /* Clear any pending debug events. */ dr6 = rdr6(); load_dr6(0); /* * Ignore debug register exceptions due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap(dr6)) return; /* * Malicious user code can configure a debug * register watchpoint to trap on data access * to the top of stack and then execute 'pop * %ss; int 3'. Due to exception deferral for * 'pop %ss', the CPU will not interrupt 'int * 3' to raise the DB# exception for the debug * register but will postpone the DB# until * execution of the first instruction of the * BP# handler (in kernel mode). Normally the * previous check would ignore DB# exceptions * for watchpoints on user addresses raised in * kernel mode. However, some CPU errata * include cases where DB# exceptions do not * properly set bits in %dr6, e.g. Haswell * HSD23 and Skylake-X SKZ24. * * A deferred DB# can also be raised on the * first instructions of system call entry * points or single-step traps via similar use * of 'pop %ss' or 'mov xxx, %ss'. */ if (pti) { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall_pti) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall_pti) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti)) return; } else { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt)) return; } if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) || /* Needed for AMD. */ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32)) return; /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, dr6, frame)) return; #endif break; case T_NMI: nmi_handle_intr(type, frame); return; } trap_fatal(frame, 0); return; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap != NULL) signo = (*p->p_sysent->sv_transtrap)(signo, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static bool trap_is_smap(struct trapframe *frame) { /* * A page fault on a userspace address is classified as * SMAP-induced if: * - SMAP is supported; * - kernel mode accessed present data page; * - rflags.AC was cleared. * Kernel must never access user space with rflags.AC cleared * if SMAP is enabled. */ return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 && (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) == PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } static bool trap_is_pti(struct trapframe *frame) { return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) == (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)); } /* * Handle all details of a page fault. * Returns: * -1 if this fault was fatal, typically from kernel mode * (cannot happen, but we need to return something). * 0 if this fault was handled by updating either the user or kernel * page table, execution can continue. * 1 if this fault was from usermode and it was not handled, a synchronous * signal should be delivered to the thread. *signo returns the signal * number, *ucode gives si_code. */ static int trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode) { struct thread *td; struct proc *p; vm_map_t map; vm_offset_t eva; int rv; vm_prot_t ftype; MPASS(!usermode || (signo != NULL && ucode != NULL)); td = curthread; p = td->td_proc; eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } if (eva >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) { *signo = SIGSEGV; *ucode = SEGV_MAPERR; return (1); } map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. * * If SMAP is enabled, filter SMAP faults also, * because illegal access might occur to the mapped * user address, causing infinite loop. */ if (!usermode && (td->td_intr_nesting_level != 0 || trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * User-mode protection key violation (PKU). May happen * either from usermode or from kernel if copyin accessed * key-protected mapping. */ if ((frame->tf_err & PGEX_PK) != 0) { if (eva > VM_MAXUSER_ADDRESS) { trap_fatal(frame, eva); return (-1); } if (usermode) { *signo = SIGSEGV; *ucode = SEGV_PKUERR; return (1); } goto after_vmfault; } /* * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ if (usermode && trap_is_pti(frame)) panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } if (usermode) return (1); after_vmfault: if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } static void trap_fatal(struct trapframe *frame, vm_offset_t eva) { int code, ss; u_int type; struct soft_segment_descriptor softseg; struct user_segment_descriptor *gdt; #ifdef KDB bool handled; #endif code = frame->tf_err; type = frame->tf_trapno; gdt = *PCPU_PTR(gdt); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); printf("\n\nFatal trap %d: %s while in %s mode\n", type, type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_PK ? " prot key" : "", code & PGEX_SGX ? " SGX" : "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(type, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif printf("trap number = %d\n", type); panic("%s", type < nitems(trap_msg) ? trap_msg[type] : "unknown/reserved trap"); } #ifdef KDTRACE_HOOKS /* * Invoke a userspace DTrace hook. The hook pointer is cleared when no * userspace probes are enabled, so we must synchronize with DTrace to ensure * that a trapping thread is able to call the hook before it is cleared. */ static bool trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *)) { int (*hook)(struct trapframe *); hook = atomic_load_ptr(hookp); enable_intr(); if (hook != NULL) return ((hook)(frame) == 0); return (false); } #endif /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n" "rip %#lx rsp %#lx rbp %#lx\n" "rax %#lx rdx %#lx rbx %#lx\n" "rcx %#lx rsi %#lx rdi %#lx\n" "r8 %#lx r9 %#lx r10 %#lx\n" "r11 %#lx r12 %#lx r13 %#lx\n" "r14 %#lx r15 %#lx rflags %#lx\n" "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n" "fsbase %#lx gsbase %#lx kgsbase %#lx\n", frame->tf_rip, frame->tf_rsp, frame->tf_rbp, frame->tf_rax, frame->tf_rdx, frame->tf_rbx, frame->tf_rcx, frame->tf_rdi, frame->tf_rsi, frame->tf_r8, frame->tf_r9, frame->tf_r10, frame->tf_r11, frame->tf_r12, frame->tf_r13, frame->tf_r14, frame->tf_r15, frame->tf_rflags, frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, frame->tf_fs, frame->tf_gs, rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } static int __noinline cpu_fetch_syscall_args_fallback(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; register_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = NARGREGS; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Too many syscall arguments!")); argp = &frame->tf_rdi; argp += reg; memcpy(sa->args, argp, sizeof(sa->args[0]) * NARGREGS); if (sa->callp->sy_narg > regcnt) { params = (caddr_t)frame->tf_rsp + sizeof(register_t); error = copyin(params, &sa->args[regcnt], (sa->callp->sy_narg - regcnt) * sizeof(sa->args[0])); if (__predict_false(error != 0)) return (error); } td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_rax; + sa->original_code = sa->code; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall || sa->code >= p->p_sysent->sv_size)) return (cpu_fetch_syscall_args_fallback(td, sa)); sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Too many syscall arguments!")); if (__predict_false(sa->callp->sy_narg > NARGREGS)) return (cpu_fetch_syscall_args_fallback(td, sa)); memcpy(sa->args, &frame->tf_rdi, sizeof(sa->args[0]) * NARGREGS); td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } #include "../../kern/subr_syscall.c" static void (*syscall_ret_l1d_flush)(void); int syscall_ret_l1d_flush_mode; static void flush_l1d_hw(void) { wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); } static void __noinline amd64_syscall_ret_flush_l1d_check(int error) { void (*p)(void); if (error != EEXIST && error != EAGAIN && error != EXDEV && error != ENOENT && error != ENOTCONN && error != EINPROGRESS) { p = atomic_load_ptr(&syscall_ret_l1d_flush); if (p != NULL) p(); } } static void __inline amd64_syscall_ret_flush_l1d_check_inline(int error) { if (__predict_false(error != 0)) amd64_syscall_ret_flush_l1d_check(error); } void amd64_syscall_ret_flush_l1d(int error) { amd64_syscall_ret_flush_l1d_check_inline(error); } void amd64_syscall_ret_flush_l1d_recalc(void) { bool l1d_hw; l1d_hw = (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0; again: switch (syscall_ret_l1d_flush_mode) { case 0: syscall_ret_l1d_flush = NULL; break; case 1: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : flush_l1d_sw_abi; break; case 2: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : NULL; break; case 3: syscall_ret_l1d_flush = flush_l1d_sw_abi; break; default: syscall_ret_l1d_flush_mode = 1; goto again; } } static int machdep_syscall_ret_flush_l1d(SYSCTL_HANDLER_ARGS) { int error, val; val = syscall_ret_l1d_flush_mode; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); syscall_ret_l1d_flush_mode = val; amd64_syscall_ret_flush_l1d_recalc(); return (0); } SYSCTL_PROC(_machdep, OID_AUTO, syscall_ret_flush_l1d, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, machdep_syscall_ret_flush_l1d, "I", "Flush L1D on syscall return with error (0 - off, 1 - on, " "2 - use hw only, 3 - use sw only)"); /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif syscallenter(td); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); KASSERT(pmap_not_in_di(), ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, td->td_sa.code), td->td_md.md_invl_gen.gen)); syscallret(td); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (__predict_false(td->td_frame->tf_rip >= (la57 ? VM_MAXUSER_ADDRESS_LA57 : VM_MAXUSER_ADDRESS_LA48))) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); amd64_syscall_ret_flush_l1d_check_inline(td->td_errno); } diff --git a/sys/amd64/cloudabi32/cloudabi32_sysvec.c b/sys/amd64/cloudabi32/cloudabi32_sysvec.c index 164f87e90e91..26924ed5a980 100644 --- a/sys/amd64/cloudabi32/cloudabi32_sysvec.c +++ b/sys/amd64/cloudabi32/cloudabi32_sysvec.c @@ -1,236 +1,237 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; extern unsigned long ia32_maxssiz; static int cloudabi32_fixup_tcb(uintptr_t *stack_base, struct image_params *imgp) { int error; uint32_t args[2]; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi32_fixup(stack_base, imgp); if (error != 0) return (error); /* * On i386, the TCB is referred to by %gs:0. Reuse the empty * space normally used by the return address (args[0]) to store * a single element array, containing a pointer to the TCB. %gs * base will point to this. * * Also let the first argument of the entry point (args[1]) * refer to the auxiliary vector, which is stored right after * the TCB. */ args[0] = *stack_base; args[1] = *stack_base + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); *stack_base -= roundup2(sizeof(args), sizeof(register_t)); return (copyout(args, (void *)*stack_base, sizeof(args))); } static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { ia32_setregs(td, imgp, stack); (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi32_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int error; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_rax; + sa->original_code = sa->code; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; /* * Fetch system call arguments. * * The vDSO has already made sure that the arguments are * eight-byte aligned. Pointers and size_t parameters are * zero-extended. This makes it possible to copy in the * arguments directly. As long as the call doesn't use 32-bit * data structures, we can just invoke the same system call * implementation used by 64-bit processes. */ error = copyin((void *)frame->tf_rcx, sa->args, sa->callp->sy_narg * sizeof(sa->args[0])); if (error != 0) return (error); /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* * System call succeeded. * * Simply copy out the 64-bit return values into the * same buffer provided for system call arguments. The * vDSO will copy them to the right spot, truncating * pointers and size_t values to 32 bits. */ frame->tf_rax = copyout(td->td_retval, (void *)frame->tf_rcx, sizeof(td->td_retval)) == 0 ? 0 : CLOUDABI_EFAULT; break; case ERESTART: /* Restart system call. */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_rax = cloudabi_convert_errno(error); break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; register_t retval[2]; /* Return values for processes returning from fork. */ if ((td->td_pflags & TDP_FORKING) != 0) { retval[0] = CLOUDABI_PROCESS_CHILD; retval[1] = td->td_tid; copyout(retval, (void *)frame->tf_rcx, sizeof(retval)); } } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { stack_t stack; uint32_t args[3]; void *frameptr; int error; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len - sizeof(args); cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Copy the arguments for the thread entry point onto the stack * (args[1] and args[2]). Similar to process startup, use the * otherwise unused return address (args[0]) for TLS. */ args[0] = tcb; args[1] = td->td_tid; args[2] = attr->argument; frameptr = (void *)td->td_frame->tf_rsp; error = copyout(args, frameptr, sizeof(args)); if (error != 0) return (error); return (cpu_set_user_tls(td, frameptr)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup_tcb, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_minuser = FREEBSD32_MINUSER, .sv_maxuser = FREEBSD32_MAXUSER, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_fixlimit = ia32_fixlimit, .sv_maxssiz = &ia32_maxssiz, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_386, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; diff --git a/sys/amd64/cloudabi64/cloudabi64_sysvec.c b/sys/amd64/cloudabi64/cloudabi64_sysvec.c index d3893902b08e..c08d912e84d4 100644 --- a/sys/amd64/cloudabi64/cloudabi64_sysvec.c +++ b/sys/amd64/cloudabi64/cloudabi64_sysvec.c @@ -1,223 +1,224 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi64_syscallnames[]; extern struct sysent cloudabi64_sysent[]; static int cloudabi64_fixup_tcb(uintptr_t *stack_base, struct image_params *imgp) { int error; register_t tcbptr; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi64_fixup(stack_base, imgp); if (error != 0) return (error); /* * On x86-64, the TCB is referred to by %fs:0. Take some space * from the top of the stack to store a single element array, * containing a pointer to the TCB. %fs base will point to this. */ tcbptr = (register_t)*stack_base; *stack_base -= sizeof(tcbptr); return (copyout(&tcbptr, (void *)*stack_base, sizeof(tcbptr))); } static void cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB, the TCB itself, * and the auxiliary vector. Let %rdx point to the auxiliary * vector, and set %fs base to the address of the TCB. */ regs = td->td_frame; regs->tf_rdi = stack + sizeof(register_t) + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi64_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_rax; + sa->original_code = sa->code; if (sa->code >= CLOUDABI64_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi64_sysent[sa->code]; /* Fetch system call arguments. */ sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; /* Actually %r10. */ sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void cloudabi64_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_rax = td->td_retval[0]; frame->tf_rdx = td->td_retval[1]; frame->tf_rflags &= ~PSL_C; break; case ERESTART: /* Restart system call. */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_rax = cloudabi_convert_errno(error); frame->tf_rflags |= PSL_C; break; } } static void cloudabi64_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* Initial register values for processes returning from fork. */ frame->tf_rax = CLOUDABI_PROCESS_CHILD; frame->tf_rdx = td->td_tid; } int cloudabi64_thread_setregs(struct thread *td, const cloudabi64_threadattr_t *attr, uint64_t tcb) { struct trapframe *frame; stack_t stack; uint64_t tcbptr; int error; /* * On x86-64, the TCB is referred to by %fs:0. Take some space * from the top of the stack to store a single element array, * containing a pointer to the TCB. %fs base will point to this. */ tcbptr = rounddown(attr->stack + attr->stack_len - sizeof(tcbptr), _Alignof(tcbptr)); error = copyout(&tcb, (void *)tcbptr, sizeof(tcb)); if (error != 0) return (error); /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = tcbptr - attr->stack; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_rdi = td->td_tid; frame->tf_rsi = attr->argument; return (cpu_set_user_tls(td, TO_PTR(tcbptr))); } static struct sysentvec cloudabi64_elf_sysvec = { .sv_size = CLOUDABI64_SYS_MAXSYSCALL, .sv_table = cloudabi64_sysent, .sv_fixup = cloudabi64_fixup_tcb, .sv_name = "CloudABI ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf64_prepare_notes, .sv_minuser = VM_MIN_ADDRESS, /* Keep top page reserved to work around AMD Ryzen stability issues. */ .sv_maxuser = VM_MAXUSER_ADDRESS - PAGE_SIZE, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi64_copyout_strings, .sv_setregs = cloudabi64_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64, .sv_set_syscall_retval = cloudabi64_set_syscall_retval, .sv_fetch_syscall_args = cloudabi64_fetch_syscall_args, .sv_syscallnames = cloudabi64_syscallnames, .sv_schedtail = cloudabi64_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec); Elf64_Brandinfo cloudabi64_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_X86_64, .sysvec = &cloudabi64_elf_sysvec, .flags = BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC, }; diff --git a/sys/amd64/ia32/ia32_syscall.c b/sys/amd64/ia32/ia32_syscall.c index 6c9399d1a52f..9294ef8ce741 100644 --- a/sys/amd64/ia32/ia32_syscall.c +++ b/sys/amd64/ia32/ia32_syscall.c @@ -1,280 +1,281 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti), IDTVEC(rsvd), IDTVEC(rsvd_pti); void ia32_syscall(struct trapframe *frame); /* Called from asm code */ void ia32_set_syscall_retval(struct thread *td, int error) { cpu_set_syscall_retval(td, error); } int ia32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; caddr_t params; u_int32_t args[8], tmp; int error, i; #ifdef COMPAT_43 u_int32_t eip; int cs; #endif p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; #ifdef COMPAT_43 if (__predict_false(frame->tf_cs == 7 && frame->tf_rip == 2)) { /* * In lcall $7,$0 after int $0x80. Convert the user * frame to what it would be for a direct int 0x80 instead * of lcall $7,$0, by popping the lcall return address. */ error = fueword32((void *)frame->tf_rsp, &eip); if (error == -1) return (EFAULT); cs = fuword16((void *)(frame->tf_rsp + sizeof(u_int32_t))); if (cs == -1) return (EFAULT); /* * Unwind in-kernel frame after all stack frame pieces * were successfully read. */ frame->tf_rip = eip; frame->tf_cs = cs; frame->tf_rsp += 2 * sizeof(u_int32_t); frame->tf_err = 7; /* size of lcall $7,$0 */ } #endif params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); sa->code = frame->tf_rax; + sa->original_code = sa->code; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword32(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. */ error = fueword32(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; if (params != NULL && sa->callp->sy_narg != 0) error = copyin(params, (caddr_t)args, (u_int)(sa->callp->sy_narg * sizeof(int))); else error = 0; for (i = 0; i < sa->callp->sy_narg; i++) sa->args[i] = args[i]; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" void ia32_syscall(struct trapframe *frame) { struct thread *td; register_t orig_tf_rflags; ksiginfo_t ksi; orig_tf_rflags = frame->tf_rflags; td = curthread; td->td_frame = frame; syscallenter(td); /* * Traced syscall. */ if (orig_tf_rflags & PSL_T) { frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_rip; trapsignal(td, &ksi); } syscallret(td); amd64_syscall_ret_flush_l1d(td->td_errno); } static void ia32_syscall_enable(void *dummy) { setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) : &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); } static void ia32_syscall_disable(void *dummy) { setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); } SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL); SYSUNINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_disable, NULL); #ifdef COMPAT_43 int setup_lcall_gate(void) { struct i386_ldt_args uap; struct user_segment_descriptor desc; uint32_t lcall_addr; int error; bzero(&uap, sizeof(uap)); uap.start = 0; uap.num = 1; lcall_addr = curproc->p_sysent->sv_psstrings - sz_lcall_tramp; bzero(&desc, sizeof(desc)); desc.sd_type = SDT_MEMERA; desc.sd_dpl = SEL_UPL; desc.sd_p = 1; desc.sd_def32 = 1; desc.sd_gran = 1; desc.sd_lolimit = 0xffff; desc.sd_hilimit = 0xf; desc.sd_lobase = lcall_addr; desc.sd_hibase = lcall_addr >> 24; error = amd64_set_ldt(curthread, &uap, &desc); if (error != 0) return (error); return (0); } #endif diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 59796e729ac4..0f8cf50e326d 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -1,124 +1,125 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ #include #include #include /* * List of locks * c - proc lock * k - only accessed by curthread * pp - pmap.c:invl_gen_mtx */ struct proc_ldt { caddr_t ldt_base; int ldt_refcnt; }; #define PMAP_INVL_GEN_NEXT_INVALID 0x1ULL struct pmap_invl_gen { u_long gen; /* (k) */ union { LIST_ENTRY(pmap_invl_gen) link; /* (pp) */ struct { struct pmap_invl_gen *next; u_char saved_pri; }; }; } __aligned(16); /* * Machine-dependent part of the proc structure for AMD64. */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_flags; /* (k) */ register_t md_spurflt_addr; /* (k) Spurious page fault address. */ struct pmap_invl_gen md_invl_gen; register_t md_efirt_tmp; /* (k) */ int md_efirt_dis_pf; /* (k) */ struct pcb md_pcb; vm_offset_t md_stack_base; }; struct mdproc { struct proc_ldt *md_ldt; /* (t) per-process ldt */ struct system_segment_descriptor md_ldt_sd; u_int md_flags; /* (c) md process flags P_MD */ }; #define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */ #define P_MD_LA48 0x00000002 /* Request LA48 after exec */ #define P_MD_LA57 0x00000004 /* Request LA57 after exec */ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 768 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[8]; }; #ifdef _KERNEL /* Get the current kernel thread stack usage. */ #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE; \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) struct proc_ldt *user_ldt_alloc(struct proc *, int); void user_ldt_free(struct thread *); struct sysarch_args; int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space); int amd64_set_ldt_data(struct thread *td, int start, int num, struct user_segment_descriptor *descs); extern struct mtx dt_lock; extern int max_ldt_segment; #define NARGREGS 6 #endif /* _KERNEL */ #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index c5538932b1e3..566af6de29e7 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -1,945 +1,946 @@ /*- * Copyright (c) 2013 Dmitry Chagin * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux64, 1); static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux_locore_o_start; extern char _binary_linux_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static int linux_vsyscall(struct thread *td); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rdi; sa->args[1] = frame->tf_rsi; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rcx; sa->args[4] = frame->tf_r8; sa->args[5] = frame->tf_r9; sa->code = frame->tf_rax; + sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_rax = td->td_retval[0]; frame->tf_r10 = frame->tf_rcx; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes, * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. * We saved this in tf_err. * */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; break; case EJUSTRETURN: break; default: frame->tf_rax = bsd_to_linux_errno(error); frame->tf_r10 = frame->tf_rcx; break; } /* * Differently from FreeBSD native ABI, on Linux only %rcx * and %r11 values are not preserved across the syscall. * Require full context restore to get all registers except * those two restored at return to usermode. * * XXX: Would be great to be able to avoid PCB_FULL_IRET * for the error == 0 case. */ set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { Elf_Auxargs *args; Elf_Auxinfo *argarray, *pos; struct proc *p; int error, issetugid; p = imgp->proc; args = (Elf64_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf_Addr *base; base = (Elf64_Addr *)*stack_base; base--; if (suword(base, (uint64_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf64_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * Starting with 2.24, glibc depends on a 16-byte stack alignment. * One "long argc" will be prepended later. */ vectp = (char **)((((uintptr_t)vectp + 8) & ~0xF) - 8); /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; set_pcb_flags(pcb, PCB_FULL_IRET); saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } /* * Copied from amd64/amd64/machdep.c * * XXX fpu state need? don't think so */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_ucontext uc; struct l_sigcontext *context; struct trapframe *regs; unsigned long rflags; int error; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); if (error != 0) return (error); p = td->td_proc; context = &uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; int sig, code; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Save user context. */ bzero(&sf, sizeof(sf)); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe); } else sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &sf.sf_si, sig); sf.sf_handler = catcher; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #define LINUX_VSYSCALL_START (-10UL << 20) #define LINUX_VSYSCALL_SZ 1024 const unsigned long linux_vsyscall_vector[] = { LINUX_SYS_gettimeofday, LINUX_SYS_linux_time, LINUX_SYS_linux_getcpu, }; static int linux_vsyscall(struct thread *td) { struct trapframe *frame; uint64_t retqaddr; int code, traced; int error; frame = td->td_frame; /* Check %rip for vsyscall area. */ if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) return (EINVAL); if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) return (EINVAL); code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; if (code >= nitems(linux_vsyscall_vector)) return (EINVAL); /* * vsyscall called as callq *(%rax), so we must * use return address from %rsp and also fixup %rsp. */ error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); if (error) return (error); frame->tf_rip = retqaddr; frame->tf_rax = linux_vsyscall_vector[code]; frame->tf_rsp += 8; traced = (frame->tf_flags & PSL_T); amd64_syscall(td, traced); return (0); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, .sv_usrstack = USRSTACK_LA48, .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = linux_vsyscall, .sv_onexec = linux_on_exec, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static void linux_vdso_install(void *param) { amd64_lower_shared_page(&elf_linux_sysvec); linux_szsigcode = (&_binary_linux_locore_o_end - &_binary_linux_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj, linux_shared_page_mapping); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static char GNULINUX_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNULINUX_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_glibc2brandshort = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf64_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_X86_64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-x86_64.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, &linux_glibc2brandshort, &linux_muslbrand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux x86-64 ELF exec handler installed\n"); } else printf("cannot insert Linux x86-64 ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "Linux 64bit support"); diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 504d57e418a5..7dfd57a74a1e 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -1,1115 +1,1116 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2003 Peter Wemm * Copyright (c) 2002 Doug Rabson * Copyright (c) 1998-1999 Andrew Gallatin * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #ifndef COMPAT_FREEBSD32 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!" #endif #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX32_MAXUSER ((1ul << 32) - PAGE_SIZE) #define LINUX32_SHAREDPAGE (LINUX32_MAXUSER - PAGE_SIZE) #define LINUX32_USRSTACK LINUX32_SHAREDPAGE static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux32_locore_o_start; extern char _binary_linux32_locore_o_end; extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux32_fixlimit(struct rlimit *rl, int which); static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); static void linux32_set_syscall_retval(struct thread *td, int error); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux32_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary)); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, 0); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp)); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { Elf32_Addr *base; base = (Elf32_Addr *)*stack_base; base--; if (suword32(base, (uint32_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn and libgcc unwind. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_esp = regs->tf_rsp; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = linux32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; sigset_t bmask; l_sigset_t lmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; lmask.__mask = frame.sf_extramask[0]; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; regs->tf_rbx = frame.sf_sc.sc_ebx; regs->tf_rdx = frame.sf_sc.sc_edx; regs->tf_rcx = frame.sf_sc.sc_ecx; regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_es = frame.sf_sc.sc_es; regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_rflags)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return(EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* * Restore signal context */ regs->tf_gs = context->sc_gs; regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; regs->tf_rbx = context->sc_ebx; regs->tf_rdx = context->sc_edx; regs->tf_rcx = context->sc_ecx; regs->tf_rax = context->sc_eax; regs->tf_rip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* * call sigaltstack & ignore results.. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux32_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->args[0] = frame->tf_rbx; sa->args[1] = frame->tf_rcx; sa->args[2] = frame->tf_rdx; sa->args[3] = frame->tf_rsi; sa->args[4] = frame->tf_rdi; sa->args[5] = frame->tf_rbp; /* Unconfirmed */ sa->code = frame->tf_rax; + sa->original_code = sa->code; if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } static void linux32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_rax = bsd_to_linux_errno(error); } } /* * Clear registers on exec * XXX copied from ia32_signal.c. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); pcb->pcb_initial_fpucw = __LINUX_NPXCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_gs = _ugssel; regs->tf_fs = _ufssel; regs->tf_es = _udatasel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = (register_t)imgp->ps_strings; x86_clear_dbregs(pcb); fpstate_drop(td); /* Do full restore on return so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); } /* * XXX copied from ia32_sysvec.c. */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; u_int32_t *vectp; char *stringp; uintptr_t destp, ustringp; struct linux32_ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(uint32_t)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(uint32_t)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(uint32_t)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(uint32_t)); } vectp = (uint32_t *)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword32(vectp++, 0) != 0) return (EFAULT); if (suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword32(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword32(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "32-bit Linux emulation"); static u_long linux32_maxdsiz = LINUX32_MAXDSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW, &linux32_maxdsiz, 0, ""); static u_long linux32_maxssiz = LINUX32_MAXSSIZ; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW, &linux32_maxssiz, 0, ""); static u_long linux32_maxvmem = LINUX32_MAXVMEM; SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW, &linux32_maxvmem, 0, ""); static void linux32_fixlimit(struct rlimit *rl, int which) { switch (which) { case RLIMIT_DATA: if (linux32_maxdsiz != 0) { if (rl->rlim_cur > linux32_maxdsiz) rl->rlim_cur = linux32_maxdsiz; if (rl->rlim_max > linux32_maxdsiz) rl->rlim_max = linux32_maxdsiz; } break; case RLIMIT_STACK: if (linux32_maxssiz != 0) { if (rl->rlim_cur > linux32_maxssiz) rl->rlim_cur = linux32_maxssiz; if (rl->rlim_max > linux32_maxssiz) rl->rlim_max = linux32_maxssiz; } break; case RLIMIT_VMEM: if (linux32_maxvmem != 0) { if (rl->rlim_cur > linux32_maxvmem) rl->rlim_cur = linux32_maxvmem; if (rl->rlim_max > linux32_maxvmem) rl->rlim_max = linux32_maxvmem; } break; } } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX32_SYS_MAXSYSCALL, .sv_table = linux32_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux32_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = LINUX32_MAXUSER, .sv_usrstack = LINUX32_USRSTACK, .sv_psstrings = LINUX32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = linux32_fixlimit, .sv_maxssiz = &linux32_maxssiz, .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux32_set_syscall_retval, .sv_fetch_syscall_args = linux32_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux32_locore_o_end - &_binary_linux32_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj, linux_shared_page_mapping); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux32_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux32_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux32_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux32_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux32_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/arm/arm/syscall.c b/sys/arm/arm/syscall.c index a851db6e4556..a635de0ec716 100644 --- a/sys/arm/arm/syscall.c +++ b/sys/arm/arm/syscall.c @@ -1,170 +1,171 @@ /* $NetBSD: fault.c,v 1.45 2003/11/20 14:44:36 scw Exp $ */ /*- * Copyright 2004 Olivier Houchard * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1997 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Brini. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * RiscBSD kernel project * * fault.c * * Fault handlers * * Created : 28/11/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include void swi_handler(struct trapframe *); int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap; struct syscall_args *sa; u_int nap; int error; nap = 4; sa = &td->td_sa; sa->code = td->td_frame->tf_r7; + sa->original_code = sa->code; ap = &td->td_frame->tf_r0; if (sa->code == SYS_syscall) { sa->code = *ap++; nap--; } else if (sa->code == SYS___syscall) { sa->code = ap[_QUAD_LOWWORD]; nap -= 2; ap += 2; } p = td->td_proc; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; error = 0; memcpy(sa->args, ap, nap * sizeof(register_t)); if (sa->callp->sy_narg > nap) { error = copyin((void *)td->td_frame->tf_usr_sp, sa->args + nap, (sa->callp->sy_narg - nap) * sizeof(register_t)); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; } return (error); } #include "../../kern/subr_syscall.c" static void syscall(struct thread *td, struct trapframe *frame) { syscallenter(td); syscallret(td); } void swi_handler(struct trapframe *frame) { struct thread *td = curthread; td->td_frame = frame; td->td_pticks = 0; /* * Enable interrupts if they were enabled before the exception. * Since all syscalls *should* come from user mode it will always * be safe to enable them, but check anyway. */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(frame->tf_spsr & PSR_I) == 0) enable_interrupts(PSR_I); if (__predict_true(frame->tf_spsr & PSR_F) == 0) enable_interrupts(PSR_F); } syscall(td, frame); } diff --git a/sys/arm/cloudabi32/cloudabi32_sysvec.c b/sys/arm/cloudabi32/cloudabi32_sysvec.c index a8c5da47d265..4df57b22e13d 100644 --- a/sys/arm/cloudabi32/cloudabi32_sysvec.c +++ b/sys/arm/cloudabi32/cloudabi32_sysvec.c @@ -1,198 +1,199 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let r0 point to the auxiliary vector, and set * tpidrurw to the TCB. */ regs = td->td_frame; regs->tf_r0 = stack + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi32_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int error; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_r12; + sa->original_code = sa->code; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; /* Fetch system call arguments from registers and the stack. */ sa->args[0] = frame->tf_r0; sa->args[1] = frame->tf_r1; sa->args[2] = frame->tf_r2; sa->args[3] = frame->tf_r3; if (sa->callp->sy_narg > 4) { error = copyin((void *)td->td_frame->tf_usr_sp, &sa->args[4], (sa->callp->sy_narg - 4) * sizeof(register_t)); if (error != 0) return (error); } /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_r1; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_r0 = td->td_retval[0]; frame->tf_r1 = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_pc -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_r0 = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_r0 = CLOUDABI_PROCESS_CHILD; frame->tf_r1 = td->td_tid; } } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_r0 = td->td_tid; frame->tf_r1 = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, TO_PTR(tcb))); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_ARM, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; diff --git a/sys/arm/include/proc.h b/sys/arm/include/proc.h index a37ccd8f621c..9566c264731e 100644 --- a/sys/arm/include/proc.h +++ b/sys/arm/include/proc.h @@ -1,82 +1,83 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * from: FreeBSD: src/sys/i386/include/proc.h,v 1.11 2001/06/29 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ #include struct md_utrap { utrap_entry_t *ut_precise[UT_MAX]; /* must be first */ int ut_refcnt; }; struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_cspr; /* (k) */ register_t md_spurflt_addr; /* (k) Spurious page fault address. */ int md_ptrace_instr; int md_ptrace_addr; int md_ptrace_instr_alt; int md_ptrace_addr_alt; }; struct mdproc { struct md_utrap *md_utrap; void *md_sigtramp; }; #define KINFO_PROC_SIZE 816 #define MAXARGS 8 /* * This holds the syscall state for a single system call. * As some syscall arguments may be 64-bit aligned we need to ensure the * args value is 64-bit aligned. The ABI will then ensure any 64-bit * arguments are already correctly aligned, even if they were passed in * via registers, we just need to make sure we copy them to an aligned * buffer. */ struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[MAXARGS]; } __aligned(8); #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/arm64/arm64/elf32_machdep.c b/sys/arm64/arm64/elf32_machdep.c index 206413b45172..7cedbffc4d43 100644 --- a/sys/arm64/arm64/elf32_machdep.c +++ b/sys/arm64/arm64/elf32_machdep.c @@ -1,280 +1,281 @@ /*- * Copyright (c) 2014, 2015 The FreeBSD Foundation. * Copyright (c) 2014, 2017 Andrew Turner. * Copyright (c) 2018 Olivier Houchard * All rights reserved. * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 32 #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #include #define FREEBSD32_MINUSER 0x00001000 #define FREEBSD32_MAXUSER ((1ul << 32) - PAGE_SIZE) #define FREEBSD32_SHAREDPAGE (FREEBSD32_MAXUSER - PAGE_SIZE) #define FREEBSD32_USRSTACK FREEBSD32_SHAREDPAGE extern const char *freebsd32_syscallnames[]; extern char aarch32_sigcode[]; extern int sz_aarch32_sigcode; static int freebsd32_fetch_syscall_args(struct thread *td); static void freebsd32_setregs(struct thread *td, struct image_params *imgp, u_long stack); static void freebsd32_set_syscall_retval(struct thread *, int); static boolean_t elf32_arm_abi_supported(struct image_params *, int32_t *, uint32_t *); extern void freebsd32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static struct sysentvec elf32_freebsd_sysvec = { .sv_size = SYS_MAXSYSCALL, .sv_table = freebsd32_sysent, .sv_transtrap = NULL, .sv_fixup = elf32_freebsd_fixup, .sv_sendsig = freebsd32_sendsig, .sv_sigcode = aarch32_sigcode, .sv_szsigcode = &sz_aarch32_sigcode, .sv_name = "FreeBSD ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_minuser = FREEBSD32_MINUSER, .sv_maxuser = FREEBSD32_MAXUSER, .sv_usrstack = FREEBSD32_USRSTACK, .sv_psstrings = FREEBSD32_PS_STRINGS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_auxargs = elf32_freebsd_copyout_auxargs, .sv_copyout_strings = freebsd32_copyout_strings, .sv_setregs = freebsd32_setregs, .sv_fixlimit = NULL, // XXX .sv_maxssiz = NULL, .sv_flags = SV_ABI_FREEBSD | SV_ILP32 | SV_SHP | SV_TIMEKEEP | SV_RNG_SEED_VER, .sv_set_syscall_retval = freebsd32_set_syscall_retval, .sv_fetch_syscall_args = freebsd32_fetch_syscall_args, .sv_syscallnames = freebsd32_syscallnames, .sv_shared_page_base = FREEBSD32_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_onexec_old = exec_onexec_old, .sv_onexit = exit_onexit, }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); static Elf32_Brandinfo freebsd32_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_ARM, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = "/libexec/ld-elf32.so.1", .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, .header_supported= elf32_arm_abi_supported, }; SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)elf32_insert_brand_entry, &freebsd32_brand_info); static boolean_t elf32_arm_abi_supported(struct image_params *imgp, int32_t *osrel __unused, uint32_t *fctl0 __unused) { const Elf32_Ehdr *hdr; /* Check if we support AArch32 */ if (ID_AA64PFR0_EL0_VAL(READ_SPECIALREG(id_aa64pfr0_el1)) != ID_AA64PFR0_EL0_64_32) return (FALSE); #define EF_ARM_EABI_VERSION(x) (((x) & EF_ARM_EABIMASK) >> 24) #define EF_ARM_EABI_FREEBSD_MIN 4 hdr = (const Elf32_Ehdr *)imgp->image_header; if (EF_ARM_EABI_VERSION(hdr->e_flags) < EF_ARM_EABI_FREEBSD_MIN) { if (bootverbose) uprintf("Attempting to execute non EABI binary " "(rev %d) image %s", EF_ARM_EABI_VERSION(hdr->e_flags), imgp->args->fname); return (FALSE); } return (TRUE); } static int freebsd32_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap; struct syscall_args *sa; int error, i, nap, narg; unsigned int args[4]; nap = 4; p = td->td_proc; ap = td->td_frame->tf_x; sa = &td->td_sa; /* r7 is the syscall id */ sa->code = td->td_frame->tf_x[7]; + sa->original_code = sa->code; if (sa->code == SYS_syscall) { sa->code = *ap++; nap--; } else if (sa->code == SYS___syscall) { sa->code = ap[1]; nap -= 2; ap += 2; } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; narg = sa->callp->sy_narg; for (i = 0; i < nap; i++) sa->args[i] = ap[i]; if (narg > nap) { if (narg - nap > nitems(args)) panic("Too many system call arguiments"); error = copyin((void *)td->td_frame->tf_x[13], args, (narg - nap) * sizeof(int)); for (i = 0; i < (narg - nap); i++) sa->args[i + nap] = args[i]; } td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } static void freebsd32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_x[0] = td->td_retval[0]; frame->tf_x[1] = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* * Reconstruct the pc to point at the swi. */ if ((frame->tf_spsr & PSR_T) != 0) frame->tf_elr -= 2; //THUMB_INSN_SIZE; else frame->tf_elr -= 4; //INSN_SIZE; break; case EJUSTRETURN: /* nothing to do */ break; default: frame->tf_x[0] = error; frame->tf_spsr |= PSR_C; break; } } static void freebsd32_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *tf = td->td_frame; struct pcb *pcb = td->td_pcb; memset(tf, 0, sizeof(struct trapframe)); /* * We need to set x0 for init as it doesn't call * cpu_set_syscall_retval to copy the value. We also * need to set td_retval for the cases where we do. */ tf->tf_x[0] = stack; /* SP_usr is mapped to x13 */ tf->tf_x[13] = stack; /* LR_usr is mapped to x14 */ tf->tf_x[14] = imgp->entry_addr; tf->tf_elr = imgp->entry_addr; tf->tf_spsr = PSR_M_32; if ((uint32_t)imgp->entry_addr & 1) tf->tf_spsr |= PSR_T; #ifdef VFP vfp_reset_state(td, pcb); #endif /* * Clear debug register state. It is not applicable to the new process. */ bzero(&pcb->pcb_dbg_regs, sizeof(pcb->pcb_dbg_regs)); } void elf32_dump_thread(struct thread *td, void *dst, size_t *off) { /* XXX: VFP */ } diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c index e876fa5011c3..c6dd8c276414 100644 --- a/sys/arm64/arm64/trap.c +++ b/sys/arm64/arm64/trap.c @@ -1,613 +1,614 @@ /*- * Copyright (c) 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif #ifdef VFP #include #endif #ifdef KDB #include #endif #ifdef DDB #include #endif /* Called from exception.S */ void do_el1h_sync(struct thread *, struct trapframe *); void do_el0_sync(struct thread *, struct trapframe *); void do_el0_error(struct trapframe *); void do_serror(struct trapframe *); void unhandled_exception(struct trapframe *); static void print_registers(struct trapframe *frame); int (*dtrace_invop_jump_addr)(struct trapframe *); typedef void (abort_handler)(struct thread *, struct trapframe *, uint64_t, uint64_t, int); static abort_handler align_abort; static abort_handler data_abort; static abort_handler external_abort; static abort_handler *abort_handlers[] = { [ISS_DATA_DFSC_TF_L0] = data_abort, [ISS_DATA_DFSC_TF_L1] = data_abort, [ISS_DATA_DFSC_TF_L2] = data_abort, [ISS_DATA_DFSC_TF_L3] = data_abort, [ISS_DATA_DFSC_AFF_L1] = data_abort, [ISS_DATA_DFSC_AFF_L2] = data_abort, [ISS_DATA_DFSC_AFF_L3] = data_abort, [ISS_DATA_DFSC_PF_L1] = data_abort, [ISS_DATA_DFSC_PF_L2] = data_abort, [ISS_DATA_DFSC_PF_L3] = data_abort, [ISS_DATA_DFSC_ALIGN] = align_abort, [ISS_DATA_DFSC_EXT] = external_abort, }; static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr, int trapno) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; ksi.ksi_trapno = trapno; trapsignal(td, &ksi); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap, *dst_ap; struct syscall_args *sa; p = td->td_proc; sa = &td->td_sa; ap = td->td_frame->tf_x; dst_ap = &sa->args[0]; sa->code = td->td_frame->tf_x[8]; + sa->original_code = sa->code; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall)) { sa->code = *ap++; } else { *dst_ap++ = *ap++; } if (__predict_false(sa->code >= p->p_sysent->sv_size)) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Syscall %d takes too many arguments", sa->code)); memcpy(dst_ap, ap, (MAXARGS - 1) * sizeof(register_t)); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" /* * Test for fault generated by given access instruction in * bus_peek_ or bus_poke_ bus function. */ extern uint32_t generic_bs_peek_1f, generic_bs_peek_2f; extern uint32_t generic_bs_peek_4f, generic_bs_peek_8f; extern uint32_t generic_bs_poke_1f, generic_bs_poke_2f; extern uint32_t generic_bs_poke_4f, generic_bs_poke_8f; static bool test_bs_fault(void *addr) { return (addr == &generic_bs_peek_1f || addr == &generic_bs_peek_2f || addr == &generic_bs_peek_4f || addr == &generic_bs_peek_8f || addr == &generic_bs_poke_1f || addr == &generic_bs_poke_2f || addr == &generic_bs_poke_4f || addr == &generic_bs_poke_8f); } static void svc_handler(struct thread *td, struct trapframe *frame) { if ((frame->tf_esr & ESR_ELx_ISS_MASK) == 0) { syscallenter(td); syscallret(td); } else { call_trapsignal(td, SIGILL, ILL_ILLOPN, (void *)frame->tf_elr, ESR_ELx_EXCEPTION(frame->tf_esr)); userret(td, frame); } } static void align_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { if (!lower) { print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("Misaligned access from kernel space!"); } call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_elr, ESR_ELx_EXCEPTION(frame->tf_esr)); userret(td, frame); } static void external_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { /* * Try to handle synchronous external aborts caused by * bus_space_peek() and/or bus_space_poke() functions. */ if (!lower && test_bs_fault((void *)frame->tf_elr)) { frame->tf_elr = (uint64_t)generic_bs_fault; return; } print_registers(frame); printf(" far: %16lx\n", far); panic("Unhandled EL%d external data abort", lower ? 0: 1); } static void data_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { struct vm_map *map; struct proc *p; struct pcb *pcb; vm_prot_t ftype; int error, sig, ucode; #ifdef KDB bool handled; #endif /* * According to the ARMv8-A rev. A.g, B2.10.5 "Load-Exclusive * and Store-Exclusive instruction usage restrictions", state * of the exclusive monitors after data abort exception is unknown. */ clrex(); #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif pcb = td->td_pcb; p = td->td_proc; if (lower) map = &p->p_vmspace->vm_map; else { intr_enable(); /* The top bit tells us which range to use */ if (far >= VM_MAXUSER_ADDRESS) { map = kernel_map; } else { map = &p->p_vmspace->vm_map; if (map == NULL) map = kernel_map; } } /* * Try to handle translation, access flag, and permission faults. * Translation faults may occur as a result of the required * break-before-make sequence used when promoting or demoting * superpages. Such faults must not occur while holding the pmap lock, * or pmap_fault() will recurse on that lock. */ if ((lower || map == kernel_map || pcb->pcb_onfault != 0) && pmap_fault(map->pmap, esr, far) == KERN_SUCCESS) return; KASSERT(td->td_md.md_spinlock_count == 0, ("data abort with spinlock held")); if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("data abort in critical section or under mutex"); } switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_INSN_ABORT: case EXCP_INSN_ABORT_L: ftype = VM_PROT_EXECUTE; break; default: ftype = (esr & ISS_DATA_WnR) == 0 ? VM_PROT_READ : VM_PROT_WRITE; break; } /* Fault in the page. */ error = vm_fault_trap(map, far, ftype, VM_FAULT_NORMAL, &sig, &ucode); if (error != KERN_SUCCESS) { if (lower) { call_trapsignal(td, sig, ucode, (void *)far, ESR_ELx_EXCEPTION(esr)); } else { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != 0) { frame->tf_x[0] = error; frame->tf_elr = pcb->pcb_onfault; return; } printf("Fatal data abort:\n"); print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(ESR_ELx_EXCEPTION(esr), 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("vm_fault failed: %lx error %d", frame->tf_elr, error); } } if (lower) userret(td, frame); } static void print_registers(struct trapframe *frame) { u_int reg; for (reg = 0; reg < nitems(frame->tf_x); reg++) { printf(" %sx%d: %16lx\n", (reg < 10) ? " " : "", reg, frame->tf_x[reg]); } printf(" sp: %16lx\n", frame->tf_sp); printf(" lr: %16lx\n", frame->tf_lr); printf(" elr: %16lx\n", frame->tf_elr); printf("spsr: %8x\n", frame->tf_spsr); } void do_el1h_sync(struct thread *td, struct trapframe *frame) { uint32_t exception; uint64_t esr, far; int dfsc; /* Read the esr register to get the exception details */ esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR4(KTR_TRAP, "do_el1_sync: curthread: %p, esr %lx, elr: %lx, frame: %p", td, esr, frame->tf_elr, frame); /* * Enable debug exceptions if we aren't already handling one. They will * be masked again in the exception handler's epilogue. */ if (exception != EXCP_BRK && exception != EXCP_WATCHPT_EL1 && exception != EXCP_SOFTSTP_EL1) dbg_enable(); switch (exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: #ifdef VFP if ((td->td_pcb->pcb_fpflags & PCB_FP_KERN) != 0) { vfp_restore_state(); } else #endif { print_registers(frame); printf(" esr: %.8lx\n", esr); panic("VFP exception in the kernel"); } break; case EXCP_INSN_ABORT: case EXCP_DATA_ABORT: far = READ_SPECIALREG(far_el1); dfsc = esr & ISS_DATA_DFSC_MASK; if (dfsc < nitems(abort_handlers) && abort_handlers[dfsc] != NULL) { abort_handlers[dfsc](td, frame, esr, far, 0); } else { print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("Unhandled EL1 %s abort: %x", exception == EXCP_INSN_ABORT ? "instruction" : "data", dfsc); } break; case EXCP_BRK: #ifdef KDTRACE_HOOKS if ((esr & ESR_ELx_ISS_MASK) == 0x40d && \ dtrace_invop_jump_addr != 0) { dtrace_invop_jump_addr(frame); break; } #endif #ifdef KDB kdb_trap(exception, 0, frame); #else panic("No debugger in kernel."); #endif break; case EXCP_WATCHPT_EL1: case EXCP_SOFTSTP_EL1: #ifdef KDB kdb_trap(exception, 0, frame); #else panic("No debugger in kernel."); #endif break; case EXCP_UNKNOWN: if (undef_insn(1, frame)) break; /* FALLTHROUGH */ default: print_registers(frame); printf(" far: %16lx\n", READ_SPECIALREG(far_el1)); panic("Unknown kernel exception %x esr_el1 %lx", exception, esr); } } void do_el0_sync(struct thread *td, struct trapframe *frame) { pcpu_bp_harden bp_harden; uint32_t exception; uint64_t esr, far; int dfsc; /* Check we have a sane environment when entering from userland */ KASSERT((uintptr_t)get_pcpu() >= VM_MIN_KERNEL_ADDRESS, ("Invalid pcpu address from userland: %p (tpidr %lx)", get_pcpu(), READ_SPECIALREG(tpidr_el1))); esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); switch (exception) { case EXCP_INSN_ABORT_L: far = READ_SPECIALREG(far_el1); /* * Userspace may be trying to train the branch predictor to * attack the kernel. If we are on a CPU affected by this * call the handler to clear the branch predictor state. */ if (far > VM_MAXUSER_ADDRESS) { bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } break; case EXCP_UNKNOWN: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: case EXCP_WATCHPT_EL0: far = READ_SPECIALREG(far_el1); break; } intr_enable(); CTR4(KTR_TRAP, "do_el0_sync: curthread: %p, esr %lx, elr: %lx, frame: %p", td, esr, frame->tf_elr, frame); switch (exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: #ifdef VFP vfp_restore_state(); #else panic("VFP exception in userland"); #endif break; case EXCP_SVC32: case EXCP_SVC64: svc_handler(td, frame); break; case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: dfsc = esr & ISS_DATA_DFSC_MASK; if (dfsc < nitems(abort_handlers) && abort_handlers[dfsc] != NULL) abort_handlers[dfsc](td, frame, esr, far, 1); else { print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("Unhandled EL0 %s abort: %x", exception == EXCP_INSN_ABORT_L ? "instruction" : "data", dfsc); } break; case EXCP_UNKNOWN: if (!undef_insn(0, frame)) call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far, exception); userret(td, frame); break; case EXCP_SP_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_sp, exception); userret(td, frame); break; case EXCP_PC_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_BRKPT_EL0: case EXCP_BRK: call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_WATCHPT_EL0: call_trapsignal(td, SIGTRAP, TRAP_TRACE, (void *)far, exception); userret(td, frame); break; case EXCP_MSR: /* * The CPU can raise EXCP_MSR when userspace executes an mrs * instruction to access a special register userspace doesn't * have access to. */ if (!undef_insn(0, frame)) call_trapsignal(td, SIGILL, ILL_PRVOPC, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_SOFTSTP_EL0: td->td_frame->tf_spsr &= ~PSR_SS; td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP; WRITE_SPECIALREG(mdscr_el1, READ_SPECIALREG(mdscr_el1) & ~DBG_MDSCR_SS); call_trapsignal(td, SIGTRAP, TRAP_TRACE, (void *)frame->tf_elr, exception); userret(td, frame); break; default: call_trapsignal(td, SIGBUS, BUS_OBJERR, (void *)frame->tf_elr, exception); userret(td, frame); break; } KASSERT((td->td_pcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0, ("Kernel VFP flags set while entering userspace")); KASSERT( td->td_pcb->pcb_fpusaved == &td->td_pcb->pcb_fpustate, ("Kernel VFP state in use when entering userspace")); } /* * TODO: We will need to handle these later when we support ARMv8.2 RAS. */ void do_serror(struct trapframe *frame) { uint64_t esr, far; far = READ_SPECIALREG(far_el1); esr = frame->tf_esr; print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("Unhandled System Error"); } void unhandled_exception(struct trapframe *frame) { uint64_t esr, far; far = READ_SPECIALREG(far_el1); esr = frame->tf_esr; print_registers(frame); printf(" far: %16lx\n", far); printf(" esr: %.8lx\n", esr); panic("Unhandled exception"); } diff --git a/sys/arm64/cloudabi32/cloudabi32_sysvec.c b/sys/arm64/cloudabi32/cloudabi32_sysvec.c index 889393560ede..b7be5cc0e5e3 100644 --- a/sys/arm64/cloudabi32/cloudabi32_sysvec.c +++ b/sys/arm64/cloudabi32/cloudabi32_sysvec.c @@ -1,206 +1,207 @@ /*- * Copyright (c) 2015-2017 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; regs = td->td_frame; memset(regs, 0, sizeof(*regs)); regs->tf_x[0] = stack + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); regs->tf_x[13] = STACKALIGN(stack); regs->tf_elr = imgp->entry_addr; regs->tf_spsr |= PSR_AARCH32; (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi32_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int error; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_x[0]; + sa->original_code = sa->code; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; /* * Fetch system call arguments. * * The vDSO has already made sure that the arguments are * eight-byte aligned. Pointers and size_t parameters are * zero-extended. This makes it possible to copy in the * arguments directly. As long as the call doesn't use 32-bit * data structures, we can just invoke the same system call * implementation used by 64-bit processes. */ error = copyin((void *)frame->tf_x[2], sa->args, sa->callp->sy_narg * sizeof(sa->args[0])); if (error != 0) return (error); /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* * System call succeeded. * * Simply copy out the 64-bit return values into the * same buffer provided for system call arguments. The * vDSO will copy them to the right spot, truncating * pointers and size_t values to 32 bits. */ if (copyout(td->td_retval, (void *)frame->tf_x[2], sizeof(td->td_retval)) == 0) { frame->tf_x[0] = 0; frame->tf_spsr &= ~PSR_C; } else { frame->tf_x[0] = CLOUDABI_EFAULT; frame->tf_spsr |= PSR_C; } break; case ERESTART: /* Restart system call. */ frame->tf_elr -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_x[0] = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; register_t retval[2]; /* Return values for processes returning from fork. */ if ((td->td_pflags & TDP_FORKING) != 0) { retval[0] = CLOUDABI_PROCESS_CHILD; retval[1] = td->td_tid; copyout(retval, (void *)frame->tf_x[2], sizeof(retval)); } frame->tf_spsr |= PSR_AARCH32; } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { struct trapframe *frame; /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; memset(frame, 0, sizeof(*frame)); frame->tf_x[0] = td->td_tid; frame->tf_x[1] = attr->argument; frame->tf_x[13] = STACKALIGN(attr->stack + attr->stack_len); frame->tf_elr = attr->entry_point; /* Set up TLS. */ return (cpu_set_user_tls(td, TO_PTR(tcb))); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = (uintmax_t)1 << 32, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_ARM, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; diff --git a/sys/arm64/cloudabi64/cloudabi64_sysvec.c b/sys/arm64/cloudabi64/cloudabi64_sysvec.c index bdbd828b7b62..624d86693457 100644 --- a/sys/arm64/cloudabi64/cloudabi64_sysvec.c +++ b/sys/arm64/cloudabi64/cloudabi64_sysvec.c @@ -1,190 +1,191 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi64_syscallnames[]; extern struct sysent cloudabi64_sysent[]; static void cloudabi64_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; exec_setregs(td, imgp, stack); /* * The stack now contains a pointer to the TCB and the auxiliary * vector. Let x0 point to the auxiliary vector, and set * tpidr_el0 to the TCB. */ regs = td->td_frame; regs->tf_x[0] = stack + roundup(sizeof(cloudabi64_tcb_t), sizeof(register_t)); (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi64_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int i; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_x[8]; + sa->original_code = sa->code; if (sa->code >= CLOUDABI64_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi64_sysent[sa->code]; /* Fetch system call arguments. */ for (i = 0; i < MAXARGS; i++) sa->args[i] = frame->tf_x[i]; /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_x[1]; return (0); } static void cloudabi64_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_x[0] = td->td_retval[0]; frame->tf_x[1] = td->td_retval[1]; frame->tf_spsr &= ~PSR_C; break; case ERESTART: /* Restart system call. */ frame->tf_elr -= 4; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_x[0] = cloudabi_convert_errno(error); frame->tf_spsr |= PSR_C; break; } } static void cloudabi64_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* * Initial register values for processes returning from fork. * Make sure that we only set these values when forking, not * when creating a new thread. */ if ((td->td_pflags & TDP_FORKING) != 0) { frame->tf_x[0] = CLOUDABI_PROCESS_CHILD; frame->tf_x[1] = td->td_tid; } } int cloudabi64_thread_setregs(struct thread *td, const cloudabi64_threadattr_t *attr, uint64_t tcb) { struct trapframe *frame; stack_t stack; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len; cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Pass in the thread ID of the new thread and the argument * pointer provided by the parent thread in as arguments to the * entry point. */ frame = td->td_frame; frame->tf_x[0] = td->td_tid; frame->tf_x[1] = attr->argument; /* Set up TLS. */ return (cpu_set_user_tls(td, TO_PTR(tcb))); } static struct sysentvec cloudabi64_elf_sysvec = { .sv_size = CLOUDABI64_SYS_MAXSYSCALL, .sv_table = cloudabi64_sysent, .sv_fixup = cloudabi64_fixup, .sv_name = "CloudABI ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf64_prepare_notes, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi64_copyout_strings, .sv_setregs = cloudabi64_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_LP64, .sv_set_syscall_retval = cloudabi64_set_syscall_retval, .sv_fetch_syscall_args = cloudabi64_fetch_syscall_args, .sv_syscallnames = cloudabi64_syscallnames, .sv_schedtail = cloudabi64_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi64_elf_sysvec); Elf64_Brandinfo cloudabi64_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_AARCH64, .sysvec = &cloudabi64_elf_sysvec, .flags = BI_CAN_EXEC_DYN | BI_BRAND_ONLY_STATIC, }; diff --git a/sys/arm64/include/proc.h b/sys/arm64/include/proc.h index bb933dc98241..3800798d79b7 100644 --- a/sys/arm64/include/proc.h +++ b/sys/arm64/include/proc.h @@ -1,70 +1,71 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * from: FreeBSD: src/sys/i386/include/proc.h,v 1.11 2001/06/29 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_daif; /* (k) */ }; struct mdproc { long md_dummy; }; #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 816 #define MAXARGS 8 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[MAXARGS]; }; #ifdef _KERNEL #include #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb); \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) #endif #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/arm64/linux/linux_sysvec.c b/sys/arm64/linux/linux_sysvec.c index 40f68a537985..e684fb604889 100644 --- a/sys/arm64/linux/linux_sysvec.c +++ b/sys/arm64/linux/linux_sysvec.c @@ -1,578 +1,579 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * Copyright (c) 2018 Turing Robotic Industries Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif MODULE_VERSION(linux64elf, 1); static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux_locore_o_start; extern char _binary_linux_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static int linux_elf_fixup(uintptr_t *stack_base, struct image_params *iparams); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_set_syscall_retval(struct thread *td, int error); static int linux_fetch_syscall_args(struct thread *td); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /* DTrace probes */ LIN_SDT_PROBE_DEFINE2(sysvec, linux_translate_traps, todo, "int", "int"); LIN_SDT_PROBE_DEFINE0(sysvec, linux_exec_setregs, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_copyout_auxargs, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_elf_fixup, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_rt_sigreturn, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_rt_sendsig, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_install, todo); LIN_SDT_PROBE_DEFINE0(sysvec, linux_vdso_deinstall, todo); /* LINUXTODO: do we have traps to translate? */ static int linux_translate_traps(int signal, int trap_code) { LIN_SDT_PROBE2(sysvec, linux_translate_traps, todo, signal, trap_code); return (signal); } LINUX_VDSO_SYM_CHAR(linux_platform); static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct syscall_args *sa; register_t *ap; p = td->td_proc; ap = td->td_frame->tf_x; sa = &td->td_sa; sa->code = td->td_frame->tf_x[8]; + sa->original_code = sa->code; /* LINUXTODO: generic syscall? */ if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; if (sa->callp->sy_narg > MAXARGS) panic("ARM64TODO: Could we have more than %d args?", MAXARGS); memcpy(sa->args, ap, MAXARGS * sizeof(register_t)); td->td_retval[0] = 0; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { td->td_retval[1] = td->td_frame->tf_x[1]; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) td->td_frame->tf_x[0] = bsd_to_linux_errno(error); } } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { Elf_Auxargs *args; Elf_Auxinfo *argarray, *pos; struct proc *p; int error, issetugid; LIN_SDT_PROBE0(sysvec, linux_copyout_auxargs, todo); p = imgp->proc; args = (Elf64_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); issetugid = p->p_flag & P_SUGID ? 1 : 0; AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, *imgp->sysent->sv_hwcap); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP2, *imgp->sysent->sv_hwcap2); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_elf_fixup(uintptr_t *stack_base, struct image_params *imgp) { LIN_SDT_PROBE0(sysvec, linux_elf_fixup, todo); return (0); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. * LINUXTODO: deduplicate against other linuxulator archs */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; int argc, envc, error; /* Calculate string base and vector table pointers. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf64_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for argc and the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= 1 + imgp->args->argc + 1 + imgp->args->envc + 1; vectp = (char **)STACKALIGN(vectp); /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); if (suword(vectp++, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Reset registers to default values on exec. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* LINUXTODO: validate */ LIN_SDT_PROBE0(sysvec, linux_exec_setregs, todo); memset(regs, 0, sizeof(*regs)); /* glibc start.S registers function pointer in x0 with atexit. */ regs->tf_sp = stack; #if 0 /* LINUXTODO: See if this is used. */ regs->tf_lr = imgp->entry_addr; #else regs->tf_lr = 0xffffffffffffffff; #endif regs->tf_elr = imgp->entry_addr; pcb->pcb_tpidr_el0 = 0; pcb->pcb_tpidrro_el0 = 0; WRITE_SPECIALREG(tpidrro_el0, 0); WRITE_SPECIALREG(tpidr_el0, 0); #ifdef VFP vfp_reset_state(td, pcb); #endif /* * Clear debug register state. It is not applicable to the new process. */ bzero(&pcb->pcb_dbg_regs, sizeof(pcb->pcb_dbg_regs)); } int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { /* LINUXTODO: implement */ LIN_SDT_PROBE0(sysvec, linux_rt_sigreturn, todo); return (EDOOFUS); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { /* LINUXTODO: implement */ LIN_SDT_PROBE0(sysvec, linux_rt_sendsig, todo); } struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_elf_fixup, .sv_sendsig = linux_rt_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF64", .sv_coredump = elf64_coredump, .sv_elf_core_osabi = ELFOSABI_NONE, .sv_elf_core_abi_vendor = LINUX_ABI_VENDOR, .sv_elf_core_prepare_notes = linux64_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, /* XXX */ .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_hwcap = &elf_hwcap, .sv_hwcap2 = &elf_hwcap2, .sv_onexec = linux_on_exec, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static void linux_vdso_install(const void *param) { linux_szsigcode = (&_binary_linux_locore_o_end - &_binary_linux_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("invalid Linux VDSO size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec); memcpy(linux_shared_page_mapping, elf_linux_sysvec.sv_sigcode, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { LIN_SDT_PROBE0(sysvec, linux_vdso_deinstall, todo); __elfN(linux_shared_page_fini)(linux_shared_page_obj, linux_shared_page_mapping); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNU_ABI_LINUX = 0; /* LINUXTODO: deduplicate */ static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNU_ABI_LINUX) return (false); *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux64_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf64_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_AARCH64, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib64/ld-linux-x86-64.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux64_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf64_Brandinfo *linux_brandlist[] = { &linux_glibc2brand, NULL }; static int linux64_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; struct linux_ioctl_handler**lihp; int error; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux arm64 ELF exec handler installed\n"); } break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux64_elf_mod = { "linux64elf", linux64_elf_modevent, 0 }; DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); FEATURE(linux64, "AArch64 Linux 64bit support"); diff --git a/sys/i386/cloudabi32/cloudabi32_sysvec.c b/sys/i386/cloudabi32/cloudabi32_sysvec.c index 4f12d2b6cbce..e0a50f6697a9 100644 --- a/sys/i386/cloudabi32/cloudabi32_sysvec.c +++ b/sys/i386/cloudabi32/cloudabi32_sysvec.c @@ -1,206 +1,207 @@ /*- * Copyright (c) 2015-2016 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *cloudabi32_syscallnames[]; extern struct sysent cloudabi32_sysent[]; static int cloudabi32_fixup_tcb(uintptr_t *stack_base, struct image_params *imgp) { int error; uint32_t args[2]; /* Place auxiliary vector and TCB on the stack. */ error = cloudabi32_fixup(stack_base, imgp); if (error != 0) return (error); /* * On i386, the TCB is referred to by %gs:0. Reuse the empty * space normally used by the return address (args[0]) to store * a single element array, containing a pointer to the TCB. %gs * base will point to this. * * Also let the first argument of the entry point (args[1]) * refer to the auxiliary vector, which is stored right after * the TCB. */ args[0] = *stack_base; args[1] = *stack_base + roundup(sizeof(cloudabi32_tcb_t), sizeof(register_t)); *stack_base -= roundup(sizeof(args), sizeof(register_t)); return (copyout(args, (void *)*stack_base, sizeof(args))); } static void cloudabi32_proc_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { exec_setregs(td, imgp, stack); (void)cpu_set_user_tls(td, TO_PTR(stack)); } static int cloudabi32_fetch_syscall_args(struct thread *td) { struct trapframe *frame; struct syscall_args *sa; int error; frame = td->td_frame; sa = &td->td_sa; /* Obtain system call number. */ sa->code = frame->tf_eax; + sa->original_code = sa->code; if (sa->code >= CLOUDABI32_SYS_MAXSYSCALL) return (ENOSYS); sa->callp = &cloudabi32_sysent[sa->code]; /* Fetch system call arguments from the stack. */ error = copyin((void *)(frame->tf_esp + 4), sa->args, sa->callp->sy_narg * sizeof(sa->args[0])); if (error != 0) return (error); /* Default system call return values. */ td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void cloudabi32_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; switch (error) { case 0: /* System call succeeded. */ frame->tf_eax = td->td_retval[0]; frame->tf_edx = td->td_retval[1]; frame->tf_eflags &= ~PSL_C; break; case ERESTART: /* Restart system call. */ frame->tf_eip -= frame->tf_err; break; case EJUSTRETURN: break; default: /* System call returned an error. */ frame->tf_eax = cloudabi_convert_errno(error); frame->tf_eflags |= PSL_C; break; } } static void cloudabi32_schedtail(struct thread *td) { struct trapframe *frame = td->td_frame; /* Initial register values for processes returning from fork. */ frame->tf_eax = CLOUDABI_PROCESS_CHILD; frame->tf_edx = td->td_tid; } int cloudabi32_thread_setregs(struct thread *td, const cloudabi32_threadattr_t *attr, uint32_t tcb) { stack_t stack; uint32_t args[3]; void *frameptr; int error; /* Perform standard register initialization. */ stack.ss_sp = TO_PTR(attr->stack); stack.ss_size = attr->stack_len - sizeof(args); cpu_set_upcall(td, TO_PTR(attr->entry_point), NULL, &stack); /* * Copy the arguments for the thread entry point onto the stack * (args[1] and args[2]). Similar to process startup, use the * otherwise unused return address (args[0]) for TLS. */ args[0] = tcb; args[1] = td->td_tid; args[2] = attr->argument; frameptr = (void *)td->td_frame->tf_esp; error = copyout(args, frameptr, sizeof(args)); if (error != 0) return (error); return (cpu_set_user_tls(td, frameptr)); } static struct sysentvec cloudabi32_elf_sysvec = { .sv_size = CLOUDABI32_SYS_MAXSYSCALL, .sv_table = cloudabi32_sysent, .sv_fixup = cloudabi32_fixup_tcb, .sv_name = "CloudABI ELF32", .sv_coredump = elf32_coredump, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_stackprot = VM_PROT_READ | VM_PROT_WRITE, .sv_copyout_strings = cloudabi32_copyout_strings, .sv_setregs = cloudabi32_proc_setregs, .sv_flags = SV_ABI_CLOUDABI | SV_CAPSICUM | SV_IA32 | SV_ILP32, .sv_set_syscall_retval = cloudabi32_set_syscall_retval, .sv_fetch_syscall_args = cloudabi32_fetch_syscall_args, .sv_syscallnames = cloudabi32_syscallnames, .sv_schedtail = cloudabi32_schedtail, }; INIT_SYSENTVEC(elf_sysvec, &cloudabi32_elf_sysvec); Elf32_Brandinfo cloudabi32_brand = { .brand = ELFOSABI_CLOUDABI, .machine = EM_386, .sysvec = &cloudabi32_elf_sysvec, .flags = BI_BRAND_ONLY_STATIC, }; diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 045478149be5..07abac23c9da 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -1,1147 +1,1148 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif void trap(struct trapframe *frame); void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *); static void trap_fatal(struct trapframe *, vm_offset_t); #ifdef KDTRACE_HOOKS static bool trap_user_dtrace(struct trapframe *, int (**hook)(struct trapframe *)); #endif void dblfault_handler(void); extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall); extern uint64_t pg_nx; struct trap_data { bool ei; const char *msg; }; static const struct trap_data trap_data[] = { [T_PRIVINFLT] = { .ei = true, .msg = "privileged instruction fault" }, [T_BPTFLT] = { .ei = false, .msg = "breakpoint instruction fault" }, [T_ARITHTRAP] = { .ei = true, .msg = "arithmetic trap" }, [T_PROTFLT] = { .ei = true, .msg = "general protection fault" }, [T_TRCTRAP] = { .ei = false, .msg = "debug exception" }, [T_PAGEFLT] = { .ei = true, .msg = "page fault" }, [T_ALIGNFLT] = { .ei = true, .msg = "alignment fault" }, [T_DIVIDE] = { .ei = true, .msg = "integer divide fault" }, [T_NMI] = { .ei = false, .msg = "non-maskable interrupt trap" }, [T_OFLOW] = { .ei = true, .msg = "overflow trap" }, [T_BOUND] = { .ei = true, .msg = "FPU bounds check fault" }, [T_DNA] = { .ei = true, .msg = "FPU device not available" }, [T_DOUBLEFLT] = { .ei = false, .msg = "double fault" }, [T_FPOPFLT] = { .ei = true, .msg = "FPU operand fetch fault" }, [T_TSSFLT] = { .ei = true, .msg = "invalid TSS fault" }, [T_SEGNPFLT] = { .ei = true, .msg = "segment not present fault" }, [T_STKFLT] = { .ei = true, .msg = "stack fault" }, [T_MCHK] = { .ei = true, .msg = "machine check trap" }, [T_XMMFLT] = { .ei = true, .msg = "SIMD floating-point exception" }, [T_DTRACE_RET] ={ .ei = true, .msg = "DTrace pid return trap" }, }; static bool trap_enable_intr(int trapno) { MPASS(trapno > 0); if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL) return (trap_data[trapno].ei); return (false); } static const char * trap_msg(int trapno) { const char *res; static const char unkn[] = "UNKNOWN"; res = NULL; if (trapno < nitems(trap_data)) res = trap_data[trapno].msg; if (res == NULL) res = unkn; return (res); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; int pf, signo, ucode; u_int type; register_t addr, dr6; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif td = curthread; p = td->td_proc; dr6 = 0; VM_CNT_INC(v_trap); type = frame->tf_trapno; KASSERT((read_eflags() & PSL_I) == 0, ("trap: interrupts enabled, type %d frame %p", type, frame)); #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(frame) != 0) return; #endif } if (type == T_MCHK) { mca_intr(); return; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) return; #endif /* * We must not allow context switches until %cr2 is read. * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts. * All faults use interrupt gates, so %cr2 can be safely read * now, before optional enable of the interrupts below. */ if (type == T_PAGEFLT) eva = rcr2(); /* * Buggy application or kernel code has disabled interrupts * and then trapped. Enabling interrupts now is wrong, but it * is better than running with interrupts disabled until they * are accidentally enabled later. */ if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) uprintf("pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); /* * Conditionally reenable interrupts. If we hold a spin lock, * then we must not reenable interrupts. This might be a * spurious page fault. */ if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 && frame->tf_eip != (int)cpu_switch_load_gs) enable_intr(); if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ #ifdef KDTRACE_HOOKS if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr)) return; #else enable_intr(); #endif signo = SIGTRAP; ucode = TRAP_BRKPT; break; case T_TRCTRAP: /* debug exception */ enable_intr(); user_trctrap_out: signo = SIGTRAP; ucode = TRAP_TRACE; dr6 = rdr6(); if ((dr6 & DBREG_DR6_BS) != 0) { PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_eflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; } PROC_UNLOCK(td->td_proc); } break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; /* * The following two traps can happen in vm86 mode, * and, if so, we want to handle them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { signo = vm86_emulate((struct vm86frame *)frame); ucode = 0; /* XXXKIB: better code ? */ if (signo == SIGTRAP) { load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (signo == 0) goto user; break; } signo = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ addr = eva; pf = trap_pfault(frame, true, eva, &signo, &ucode); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (pf == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; break; } #endif if (pf == -1) return; if (pf == 0) goto user; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } return; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); return; #endif /* POWERFAIL_NMI */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) return; uprintf("pid %d killed due to lack of floating point\n", p->p_pid); signo = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = npxtrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: (void)trap_user_dtrace(frame, &dtrace_return_probe_ptr); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void)trap_pfault(frame, false, eva, NULL, NULL); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) return; break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); return; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { signo = vm86_emulate((struct vm86frame *)frame); if (signo == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (signo != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); return; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif return; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. * * N.B. Comparing to long mode, 32-bit mode * does not push %esp on the trap frame, * because iretl faulted while in ring 0. As * the consequence, there is no need to fixup * the stack pointer for doreti_iret_fault, * the fixup and the complimentary trap() call * are executed on the main thread stack, not * on the trampoline stack. */ if (frame->tf_eip == (int)doreti_iret + setidt_disp) { frame->tf_eip = (int)doreti_iret_fault + setidt_disp; return; } if (type == T_STKFLT) break; if (frame->tf_eip == (int)doreti_popl_ds + setidt_disp) { frame->tf_eip = (int)doreti_popl_ds_fault + setidt_disp; return; } if (frame->tf_eip == (int)doreti_popl_es + setidt_disp) { frame->tf_eip = (int)doreti_popl_es_fault + setidt_disp; return; } if (frame->tf_eip == (int)doreti_popl_fs + setidt_disp) { frame->tf_eip = (int)doreti_popl_fs_fault + setidt_disp; return; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* debug exception */ kernel_trctrap: /* Clear any pending debug events. */ dr6 = rdr6(); load_dr6(0); /* * Ignore debug register exceptions due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap(dr6) && !(curpcb->pcb_flags & PCB_VM86CALL)) return; /* * Malicious user code can configure a debug * register watchpoint to trap on data access * to the top of stack and then execute 'pop * %ss; int 3'. Due to exception deferral for * 'pop %ss', the CPU will not interrupt 'int * 3' to raise the DB# exception for the debug * register but will postpone the DB# until * execution of the first instruction of the * BP# handler (in kernel mode). Normally the * previous check would ignore DB# exceptions * for watchpoints on user addresses raised in * kernel mode. However, some CPU errata * include cases where DB# exceptions do not * properly set bits in %dr6, e.g. Haswell * HSD23 and Skylake-X SKZ24. * * A deferred DB# can also be raised on the * first instructions of system call entry * points or single-step traps via similar use * of 'pop %ss' or 'mov xxx, %ss'. */ if (frame->tf_eip == (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp || frame->tf_eip == (uintptr_t)IDTVEC(bpt) + setidt_disp || frame->tf_eip == (uintptr_t)IDTVEC(dbg) + setidt_disp) return; /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, dr6, frame)) return; #endif break; case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } return; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); return; #endif /* POWERFAIL_NMI */ } trap_fatal(frame, eva); return; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap != NULL) signo = (*p->p_sysent->sv_transtrap)(signo, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, addr, frame->tf_ss, frame->tf_esp, frame->tf_cs, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } /* * Handle all details of a page fault. * Returns: * -2 if the fault was caused by triggered workaround for Intel Pentium * 0xf00f bug. * -1 if this fault was fatal, typically from kernel mode * (cannot happen, but we need to return something). * 0 if this fault was handled by updating either the user or kernel * page table, execution can continue. * 1 if this fault was from usermode and it was not handled, a synchronous * signal should be delivered to the thread. *signo returns the signal * number, *ucode gives si_code. */ static int trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva, int *signo, int *ucode) { struct thread *td; struct proc *p; vm_map_t map; int rv; vm_prot_t ftype; MPASS(!usermode || (signo != NULL && ucode != NULL)); td = curthread; p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } if (eva >= PMAP_TRM_MIN_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { *ucode = ILL_PRVOPC; *signo = SIGILL; return (-2); } #endif if (usermode) { *signo = SIGSEGV; *ucode = SEGV_MAPERR; return (1); } trap_fatal(frame, eva); return (-1); } else { map = usermode ? &p->p_vmspace->vm_map : kernel_map; /* * Kernel cannot access a user-space address directly * because user pages are not mapped. Also, page * faults must not be caused during the interrupts. */ if (!usermode && td->td_intr_nesting_level != 0) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } if (usermode) return (1); if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } static void trap_fatal(struct trapframe *frame, vm_offset_t eva) { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; #ifdef KDB bool handled; #endif code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type), frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", pg_nx != 0 ? (code & PGEX_I ? " instruction" : " data") : "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } else { printf("error code = %#x\n", code); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; frame->tf_err = eva; /* smuggle fault address to ddb */ handled = kdb_trap(type, 0, frame); frame->tf_err = code; /* restore error code */ kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif printf("trap number = %d\n", type); if (trap_msg(type) != NULL) panic("%s", trap_msg(type)); else panic("unknown/reserved trap"); } #ifdef KDTRACE_HOOKS /* * Invoke a userspace DTrace hook. The hook pointer is cleared when no * userspace probes are enabled, so we must synchronize with DTrace to ensure * that a trapping thread is able to call the hook before it is cleared. */ static bool trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *)) { int (*hook)(struct trapframe *); hook = atomic_load_ptr(hookp); enable_intr(); if (hook != NULL) return ((hook)(frame) == 0); return (false); } #endif /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler(void) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip); printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp); printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; caddr_t params; long tmp; int error; #ifdef COMPAT_43 u_int32_t eip; int cs; #endif p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; #ifdef COMPAT_43 if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) { /* * In lcall $7,$0 after int $0x80. Convert the user * frame to what it would be for a direct int 0x80 instead * of lcall $7,$0, by popping the lcall return address. */ error = fueword32((void *)frame->tf_esp, &eip); if (error == -1) return (EFAULT); cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t))); if (cs == -1) return (EFAULT); /* * Unwind in-kernel frame after all stack frame pieces * were successfully read. */ frame->tf_eip = eip; frame->tf_cs = cs; frame->tf_esp += 2 * sizeof(u_int32_t); frame->tf_err = 7; /* size of lcall $7,$0 */ } #endif sa->code = frame->tf_eax; + sa->original_code = sa->code; params = (caddr_t)frame->tf_esp + sizeof(uint32_t); /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(uint32_t); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; if (params != NULL && sa->callp->sy_narg != 0) error = copyin(params, (caddr_t)sa->args, (u_int)(sa->callp->sy_narg * sizeof(uint32_t))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; register_t orig_tf_eflags; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; syscallenter(td); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); syscallret(td); } diff --git a/sys/i386/include/proc.h b/sys/i386/include/proc.h index 2950946ff155..76e1ac611474 100644 --- a/sys/i386/include/proc.h +++ b/sys/i386/include/proc.h @@ -1,90 +1,91 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ #include struct proc_ldt { caddr_t ldt_base; int ldt_len; int ldt_refcnt; u_long ldt_active; struct segment_descriptor ldt_sd; }; /* * Machine-dependent part of the proc structure for i386. * Table of MD locks: * t - Descriptor tables lock */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_flags; /* (k) */ register_t md_spurflt_addr; /* (k) Spurious page fault address. */ }; struct mdproc { struct proc_ldt *md_ldt; /* (t) per-process ldt */ }; #define KINFO_PROC_SIZE 768 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[8]; }; #ifdef _KERNEL /* Get the current kernel thread stack usage. */ #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE; \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) void set_user_ldt(struct mdproc *); struct proc_ldt *user_ldt_alloc(struct mdproc *, int); void user_ldt_free(struct thread *); void user_ldt_deref(struct proc_ldt *pldt); extern struct mtx dt_lock; #endif /* _KERNEL */ #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c index adb70fded6dc..a0959d55b585 100644 --- a/sys/i386/linux/linux_sysvec.c +++ b/sys/i386/linux/linux_sysvec.c @@ -1,1063 +1,1064 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_shared_page_obj; static char *linux_shared_page_mapping; extern char _binary_linux_locore_o_start; extern char _binary_linux_locore_o_end; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(uintptr_t *stack_base, struct image_params *iparams); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(void *param); static void linux_vdso_deinstall(void *param); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)args->argc + 1); base--; suword(base, (intptr_t)envp); base--; suword(base, (intptr_t)argv); base--; suword(base, imgp->args->argc); *stack_base = (uintptr_t)base; return (0); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { struct proc *p; Elf32_Auxargs *args; Elf32_Auxinfo *argarray, *pos; struct ps_strings *arginfo; int error, issetugid; p = imgp->proc; issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; args = (Elf32_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, imgp->proc->p_sysent->sv_shared_page_base); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { register_t *base; base = (register_t *)*stack_base; base--; if (suword(base, (register_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copied from kern/kern_exec.c */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; /* Calculate string base and vector table pointers. */ p = imgp->proc; if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; destp = (uintptr_t)arginfo; if (execpath_len != 0) { destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = linux_rt_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig, code; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = ksi->ksi_signo; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_esp = regs->tf_esp; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = linux_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; sigset_t bmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* Call sigaltstack & ignore results. */ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_eax; + sa->original_code = sa->code; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_eax = bsd_to_linux_errno(error); } } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel. */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32 | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_locore_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static void linux_vdso_install(void *param) { linux_szsigcode = (&_binary_linux_locore_o_end - &_binary_linux_locore_o_start); if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) panic("Linux invalid vdso size\n"); __elfN(linux_vdso_fixup)(&elf_linux_sysvec); linux_shared_page_obj = __elfN(linux_shared_page_init) (&linux_shared_page_mapping); __elfN(linux_vdso_reloc)(&elf_linux_sysvec); bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, linux_szsigcode); elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, linux_vdso_install, NULL); static void linux_vdso_deinstall(void *param) { __elfN(linux_shared_page_fini)(linux_shared_page_obj, linux_shared_page_mapping); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); LIST_INIT(&futex_list); mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_dev_shm_create(); linux_osd_jail_register(); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); mtx_destroy(&futex_mtx); linux_dev_shm_destroy(); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); FEATURE(linux, "Linux 32bit support"); diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 7d971d295cba..328a69bc5f23 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1,1764 +1,1764 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xfc, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x4a0, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xc4, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3b8, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3d0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x98, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x300, +_Static_assert(offsetof(struct thread, td_frame) == 0x304, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x344, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x348, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x6c, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x78, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x268, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x27c, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x308, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; struct thread_domain_data { struct thread *tdd_zombies; int tdd_reapticks; } __aligned(CACHE_LINE_SIZE); static struct thread_domain_data thread_domain_data[MAXMEMDOM]; static struct task thread_reap_task; static struct callout thread_reap_callout; static void thread_zombie(struct thread *); static void thread_reap(void); static void thread_reap_all(void); static void thread_reap_task_cb(void *, int); static void thread_reap_callout_cb(void *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); static void thread_free_batched(struct thread *td); static __exclusive_cache_line struct mtx tid_lock; static bitstr_t *tid_bitmap; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); static int maxthread; SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN, &maxthread, 0, "Maximum number of threads"); static __exclusive_cache_line int nthreads; static LIST_HEAD(tidhashhead, thread) *tidhashtbl; static u_long tidhash; static u_long tidhashlock; static struct rwlock *tidhashtbl_lock; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) #define TIDHASHLOCK(tid) (&tidhashtbl_lock[(tid) & tidhashlock]) EVENTHANDLER_LIST_DEFINE(thread_ctor); EVENTHANDLER_LIST_DEFINE(thread_dtor); EVENTHANDLER_LIST_DEFINE(thread_init); EVENTHANDLER_LIST_DEFINE(thread_fini); static bool thread_count_inc_try(void) { int nthreads_new; nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1; if (nthreads_new >= maxthread - 100) { if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 || nthreads_new >= maxthread) { atomic_subtract_int(&nthreads, 1); return (false); } } return (true); } static bool thread_count_inc(void) { static struct timeval lastfail; static int curfail; thread_reap(); if (thread_count_inc_try()) { return (true); } thread_reap_all(); if (thread_count_inc_try()) { return (true); } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxthread limit exceeded by uid %u " "(pid %d); consider increasing kern.maxthread\n", curthread->td_ucred->cr_ruid, curproc->p_pid); } return (false); } static void thread_count_sub(int n) { atomic_subtract_int(&nthreads, n); } static void thread_count_dec(void) { thread_count_sub(1); } static lwpid_t tid_alloc(void) { static lwpid_t trytid; lwpid_t tid; mtx_lock(&tid_lock); /* * It is an invariant that the bitmap is big enough to hold maxthread * IDs. If we got to this point there has to be at least one free. */ if (trytid >= maxthread) trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); if (tid == -1) { KASSERT(trytid != 0, ("unexpectedly ran out of IDs")); trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); KASSERT(tid != -1, ("unexpectedly ran out of IDs")); } bit_set(tid_bitmap, tid); trytid = tid + 1; mtx_unlock(&tid_lock); return (tid + NO_PID); } static void tid_free_locked(lwpid_t rtid) { lwpid_t tid; mtx_assert(&tid_lock, MA_OWNED); KASSERT(rtid >= NO_PID, ("%s: invalid tid %d\n", __func__, rtid)); tid = rtid - NO_PID; KASSERT(bit_test(tid_bitmap, tid) != 0, ("thread ID %d not allocated\n", rtid)); bit_clear(tid_bitmap, tid); } static void tid_free(lwpid_t rtid) { mtx_lock(&tid_lock); tid_free_locked(rtid); mtx_unlock(&tid_lock); } static void tid_free_batch(lwpid_t *batch, int n) { int i; mtx_lock(&tid_lock); for (i = 0; i < n; i++) { tid_free_locked(batch[i]); } mtx_unlock(&tid_lock); } /* * Batching for thread reapping. */ struct tidbatch { lwpid_t tab[16]; int n; }; static void tidbatch_prep(struct tidbatch *tb) { tb->n = 0; } static void tidbatch_add(struct tidbatch *tb, struct thread *td) { KASSERT(tb->n < nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); tb->tab[tb->n] = td->td_tid; tb->n++; } static void tidbatch_process(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n == nitems(tb->tab)) { tid_free_batch(tb->tab, tb->n); tb->n = 0; } } static void tidbatch_final(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n != 0) { tid_free_batch(tb->tab, tb->n); } } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; TD_SET_STATE(td, TDS_INACTIVE); td->td_lastcpu = td->td_oncpu = NOCPU; /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; #ifdef AUDIT audit_thread_alloc(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_ctor(td); #endif umtx_thread_alloc(td); MPASS(td->td_sel == NULL); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (TD_GET_STATE(td)) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_dtor(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); seltdfini(td); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_allocdomain = vm_phys_domain(vtophys(td)); td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_DIRECT_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); MPASS(td->td_sel == NULL); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } extern int max_threads_per_proc; /* * Initialize global thread allocation resources. */ void threadinit(void) { u_long i; lwpid_t tid0; uint32_t flags; /* * Place an upper limit on threads which can be allocated. * * Note that other factors may make the de facto limit much lower. * * Platform limits are somewhat arbitrary but deemed "more than good * enough" for the foreseable future. */ if (maxthread == 0) { #ifdef _LP64 maxthread = MIN(maxproc * max_threads_per_proc, 1000000); #else maxthread = MIN(maxproc * max_threads_per_proc, 100000); #endif } mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK); /* * Handle thread0. */ thread_count_inc(); tid0 = tid_alloc(); if (tid0 != THREAD0_TID) panic("tid0 %d != %d\n", tid0, THREAD0_TID); flags = UMA_ZONE_NOFREE; #ifdef __aarch64__ /* * Force thread structures to be allocated from the direct map. * Otherwise, superpage promotions and demotions may temporarily * invalidate thread structure mappings. For most dynamically allocated * structures this is not a problem, but translation faults cannot be * handled without accessing curthread. */ flags |= UMA_ZONE_CONTIG; #endif thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, flags); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); tidhashlock = (tidhash + 1) / 64; if (tidhashlock > 0) tidhashlock--; tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1), M_TIDHASH, M_WAITOK | M_ZERO); for (i = 0; i < tidhashlock + 1; i++) rw_init(&tidhashtbl_lock[i], "tidhash"); TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL); callout_init(&thread_reap_callout, 1); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Place an unused thread on the zombie list. */ void thread_zombie(struct thread *td) { struct thread_domain_data *tdd; struct thread *ztd; tdd = &thread_domain_data[td->td_allocdomain]; ztd = atomic_load_ptr(&tdd->tdd_zombies); for (;;) { td->td_zombie = ztd; if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t *)&ztd, (uintptr_t)td)) break; continue; } } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombies from passed domain. */ static void thread_reap_domain(struct thread_domain_data *tdd) { struct thread *itd, *ntd; struct tidbatch tidbatch; struct credbatch credbatch; int tdcount; struct plimit *lim; int limcount; /* * Reading upfront is pessimal if followed by concurrent atomic_swap, * but most of the time the list is empty. */ if (tdd->tdd_zombies == NULL) return; itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t)NULL); if (itd == NULL) return; /* * Multiple CPUs can get here, the race is fine as ticks is only * advisory. */ tdd->tdd_reapticks = ticks; tidbatch_prep(&tidbatch); credbatch_prep(&credbatch); tdcount = 0; lim = NULL; limcount = 0; while (itd != NULL) { ntd = itd->td_zombie; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd); tidbatch_add(&tidbatch, itd); credbatch_add(&credbatch, itd); MPASS(itd->td_limit != NULL); if (lim != itd->td_limit) { if (limcount != 0) { lim_freen(lim, limcount); limcount = 0; } } lim = itd->td_limit; limcount++; thread_free_batched(itd); tidbatch_process(&tidbatch); credbatch_process(&credbatch); tdcount++; if (tdcount == 32) { thread_count_sub(tdcount); tdcount = 0; } itd = ntd; } tidbatch_final(&tidbatch); credbatch_final(&credbatch); if (tdcount != 0) { thread_count_sub(tdcount); } MPASS(limcount != 0); lim_freen(lim, limcount); } /* * Reap zombies from all domains. */ static void thread_reap_all(void) { struct thread_domain_data *tdd; int i, domain; domain = PCPU_GET(domain); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[(i + domain) % vm_ndomains]; thread_reap_domain(tdd); } } /* * Reap zombies from local domain. */ static void thread_reap(void) { struct thread_domain_data *tdd; int domain; domain = PCPU_GET(domain); tdd = &thread_domain_data[domain]; thread_reap_domain(tdd); } static void thread_reap_task_cb(void *arg __unused, int pending __unused) { thread_reap_all(); } static void thread_reap_callout_cb(void *arg __unused) { struct thread_domain_data *tdd; int i, cticks, lticks; bool wantreap; wantreap = false; cticks = atomic_load_int(&ticks); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[i]; lticks = tdd->tdd_reapticks; if (tdd->tdd_zombies != NULL && (u_int)(cticks - lticks) > 5 * hz) { wantreap = true; break; } } if (wantreap) taskqueue_enqueue(taskqueue_thread, &thread_reap_task); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Calling this function guarantees that any thread that exited before * the call is reaped when the function returns. By 'exited' we mean * a thread removed from the process linkage with thread_unlink(). * Practically this means that caller must lock/unlock corresponding * process lock before the call, to synchronize with thread_exit(). */ void thread_reap_barrier(void) { struct task *t; /* * First do context switches to each CPU to ensure that all * PCPU pc_deadthreads are moved to zombie list. */ quiesce_all_cpus("", PDROP); /* * Second, fire the task in the same thread as normal * thread_reap() is done, to serialize reaping. */ t = malloc(sizeof(*t), M_TEMP, M_WAITOK); TASK_INIT(t, 0, thread_reap_task_cb, t); taskqueue_enqueue(taskqueue_thread, t); taskqueue_drain(taskqueue_thread, t); free(t, M_TEMP); } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; lwpid_t tid; if (!thread_count_inc()) { return (NULL); } tid = tid_alloc(); td = uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); tid_free(tid); thread_count_dec(); return (NULL); } td->td_tid = tid; cpu_thread_alloc(td); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ static void thread_free_batched(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); callout_drain(&td->td_slpcallout); /* * Freeing handled by the caller. */ td->td_tid = -1; uma_zfree(thread_zone, td); } void thread_free(struct thread *td) { lwpid_t tid; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); tid = td->td_tid; thread_free_batched(td); tid_free(tid); thread_count_dec(); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_realucred = crcowget(p->p_ucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { MPASS(td->td_realucred == td->td_ucred); newtd->td_realucred = crcowget(td->td_realucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_realucred != NULL) crcowfree(td); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldlimit = NULL; PROC_LOCK(p); oldcred = crcowsync(); if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); MPASS(td->td_realucred == td->td_ucred); /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) { PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL); } else if (PMC_SYSTEM_SAMPLING_ACTIVE()) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg_locked(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); TD_SET_STATE(td, TDS_INACTIVE); #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ TD_SET_STATE(td, TDS_INACTIVE); td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); #ifdef EPOCH_TRACE SLIST_INIT(&td->td_epochs); #endif sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef EPOCH_TRACE MPASS(SLIST_EMPTY(&td->td_epochs)); #endif TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; /* * Since the thread lock is dropped by the scheduler we have * to retry to check for races. */ restart: switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) { wakeup_swapper |= thread_unsuspend_one(td2, p, true); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, EINTR); return (wakeup_swapper); } break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } } break; default: break; } thread_unlock(td2); return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); thread_unlock(td2); #endif } else thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND); PROC_LOCK(p); } return (0); } /* * Check for possible stops and suspensions while executing a * casueword or similar transiently failing operation. * * The sleep argument controls whether the function can handle a stop * request itself or it should return ERESTART and the request is * proceed at the kernel/user boundary in ast. * * Typically, when retrying due to casueword(9) failure (rv == 1), we * should handle the stop requests there, with exception of cases when * the thread owns a kernel resource, for instance busied the umtx * key, or when functions return immediately if thread_check_susp() * returned non-zero. On the other hand, retrying the whole lock * operation, we better not stop there but delegate the handling to * ast. * * If the request is for thread termination P_SINGLE_EXIT, we cannot * handle it at all, and simply return EINTR. */ int thread_check_susp(struct thread *td, bool sleep) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) error = sleep ? thread_suspend_check(0) : ERESTART; PROC_UNLOCK(p); return (error); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td, 0)); } void thread_run_flash(struct thread *td) { struct proc *p; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (TD_ON_SLEEPQ(td)) sleepq_remove_nested(td); else thread_lock(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); PROC_SLOCK(p); MPASS(p->p_suspcount > 0); p->p_suspcount--; PROC_SUNLOCK(p); if (setrunnable(td, 0)) kick_proc0(); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } else thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } else thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } /* * Locate a thread by number and return with proc lock held. * * thread exit establishes proc -> tidhash lock ordering, but lookup * takes tidhash first and needs to return locked proc. * * The problem is worked around by relying on type-safety of both * structures and doing the work in 2 steps: * - tidhash-locked lookup which saves both thread and proc pointers * - proc-locked verification that the found thread still matches */ static bool tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp) { #define RUN_THRESH 16 struct proc *p; struct thread *td; int run; bool locked; run = 0; rw_rlock(TIDHASHLOCK(tid)); locked = true; LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid != tid) { run++; continue; } p = td->td_proc; if (pid != -1 && p->p_pid != pid) { td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(TIDHASHLOCK(tid))) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(tid)); locked = false; break; } } break; } if (locked) rw_runlock(TIDHASHLOCK(tid)); if (td == NULL) return (false); *pp = p; *tdp = td; return (true); } struct thread * tdfind(lwpid_t tid, pid_t pid) { struct proc *p; struct thread *td; td = curthread; if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) return (NULL); PROC_LOCK(td->td_proc); return (td); } for (;;) { if (!tdfind_hash(tid, pid, &p, &td)) return (NULL); PROC_LOCK(p); if (td->td_tid != tid) { PROC_UNLOCK(p); continue; } if (td->td_proc != p) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); return (NULL); } return (td); } } void tidhash_add(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); } void tidhash_remove(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_REMOVE(td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); } diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index 85a0814a2125..2304e3e7f3f9 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -1,274 +1,275 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2010 Konstantin Belousov * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include "opt_capsicum.h" #include "opt_ktrace.h" __FBSDID("$FreeBSD$"); #include #include #include #ifdef KTRACE #include #include #endif #include static inline void syscallenter(struct thread *td) { struct proc *p; struct syscall_args *sa; struct sysent *se; int error, traced; bool sy_thr_static; VM_CNT_INC(v_syscall); p = td->td_proc; sa = &td->td_sa; td->td_pticks = 0; if (__predict_false(td->td_cowgen != p->p_cowgen)) thread_cow_update(td); traced = (p->p_flag & P_TRACED) != 0; if (__predict_false(traced || td->td_dbgflags & TDB_USERWR)) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; if (traced) td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } error = (p->p_sysent->sv_fetch_syscall_args)(td); se = sa->callp; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, se->sy_narg, sa->args); #endif KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (__predict_false(error != 0)) { td->td_errno = error; goto retval; } if (__predict_false(traced)) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_SCE) ptracestop((td), SIGTRAP, NULL); PROC_UNLOCK(p); if ((td->td_dbgflags & TDB_USERWR) != 0) { /* * Reread syscall number and arguments if debugger * modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td); se = sa->callp; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, se->sy_narg, sa->args); #endif if (error != 0) { td->td_errno = error; goto retval; } } } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (__predict_false(IN_CAPABILITY_MODE(td) && (se->sy_flags & SYF_CAPENABLED) == 0)) { td->td_errno = error = ECAPMODE; goto retval; } #endif /* * Fetch fast sigblock value at the time of syscall entry to * handle sleepqueue primitives which might call cursig(). */ if (__predict_false(sigfastblock_fetch_always)) (void)sigfastblock_fetch(td); /* Let system calls set td_errno directly. */ KASSERT((td->td_pflags & TDP_NERRNO) == 0, ("%s: TDP_NERRNO set", __func__)); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (__predict_false(SYSTRACE_ENABLED() || AUDIT_SYSCALL_ENTER(sa->code, td) || !sy_thr_static)) { if (!sy_thr_static) { error = syscall_thread_enter(td, se); if (error != 0) { td->td_errno = error; goto retval; } } #ifdef KDTRACE_HOOKS /* Give the syscall:::entry DTrace probe a chance to fire. */ if (__predict_false(se->sy_entry != 0)) (*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0); #endif error = (se->sy_call)(td, sa->args); /* Save the latest error return value. */ if (__predict_false((td->td_pflags & TDP_NERRNO) != 0)) td->td_pflags &= ~TDP_NERRNO; else td->td_errno = error; /* * Note that some syscall implementations (e.g., sys_execve) * will commit the audit record just before their final return. * These were done under the assumption that nothing of interest * would happen between their return and here, where we would * normally commit the audit record. These assumptions will * need to be revisited should any substantial logic be added * above. */ AUDIT_SYSCALL_EXIT(error, td); #ifdef KDTRACE_HOOKS /* Give the syscall:::return DTrace probe a chance to fire. */ if (__predict_false(se->sy_return != 0)) (*systrace_probe_func)(sa, SYSTRACE_RETURN, error ? -1 : td->td_retval[0]); #endif if (!sy_thr_static) syscall_thread_exit(td, se); } else { error = (se->sy_call)(td, sa->args); /* Save the latest error return value. */ if (__predict_false((td->td_pflags & TDP_NERRNO) != 0)) td->td_pflags &= ~TDP_NERRNO; else td->td_errno = error; } retval: KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, "retval0:%#lx", td->td_retval[0], "retval1:%#lx", td->td_retval[1]); if (__predict_false(traced)) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); } static inline void syscallret(struct thread *td) { struct proc *p; struct syscall_args *sa; ksiginfo_t ksi; int traced; KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); KASSERT(td->td_errno != ERELOOKUP, ("ERELOOKUP not consumed syscall %d", td->td_sa.code)); p = td->td_proc; sa = &td->td_sa; if (__predict_false(td->td_errno == ENOTCAPABLE || td->td_errno == ECAPMODE)) { if ((trap_enotcap || (p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_errno = td->td_errno; ksi.ksi_code = TRAP_CAP; + ksi.ksi_info.si_syscall = sa->original_code; trapsignal(td, &ksi); } } /* * Handle reschedule and other end-of-syscall issues */ userret(td, td->td_frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, td->td_errno, td->td_retval[0]); } #endif traced = 0; if (__predict_false(p->p_flag & P_TRACED)) { traced = 1; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); } if (__predict_false(traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0)) { PROC_LOCK(p); /* * If tracing the execed process, trap to the debugger * so that breakpoints can be set before the program * executes. If debugger requested tracing of syscall * returns, do it now too. */ if (traced && ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 || (p->p_ptevents & PTRACE_SCX) != 0)) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK); PROC_UNLOCK(p); } if (__predict_false(td->td_pflags & TDP_RFPPWAIT)) fork_rfppwait(td); } diff --git a/sys/mips/include/proc.h b/sys/mips/include/proc.h index 0cb1d433387c..29d832a162e6 100644 --- a/sys/mips/include/proc.h +++ b/sys/mips/include/proc.h @@ -1,98 +1,99 @@ /* $OpenBSD: proc.h,v 1.2 1998/09/15 10:50:12 pefo Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.1 (Berkeley) 6/10/93 * JNPR: proc.h,v 1.7.2.1 2007/09/10 06:25:24 girish * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ #ifdef CPU_CNMIPS #include #endif /* * Machine-dependent part of the proc structure. */ struct mdthread { int md_flags; /* machine-dependent flags */ #if defined(__mips_n64) || defined(__mips_n32) /* PHYSADDR_64_BIT */ uint64_t md_upte[KSTACK_PAGES]; /* ptes for mapping u pcb */ #else int md_upte[KSTACK_PAGES]; #endif uintptr_t md_ss_addr; /* single step address for ptrace */ int md_ss_instr; /* single step instruction for ptrace */ register_t md_saved_intr; u_int md_spinlock_count; /* The following is CPU dependent, but kept in for compatibility */ int md_pc_ctrl; /* performance counter control */ int md_pc_count; /* performance counter */ int md_pc_spill; /* performance counter spill */ void *md_tls; #ifdef CPU_CNMIPS struct octeon_cop2_state *md_cop2; /* kernel context */ struct octeon_cop2_state *md_ucop2; /* userland context */ #define COP2_OWNER_USERLAND 0x0000 /* Userland owns COP2 */ #define COP2_OWNER_KERNEL 0x0001 /* Kernel owns COP2 */ int md_cop2owner; #endif }; /* md_flags */ #define MDTD_FPUSED 0x0001 /* Process used the FPU */ #define MDTD_COP2USED 0x0002 /* Process used the COP2 */ struct mdproc { size_t md_tls_tcb_offset; /* TCB offset */ }; #define MAXARGS 8 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[MAXARGS]; }; #ifdef __mips_n64 #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 816 #else #define KINFO_PROC_SIZE 816 #endif #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/mips/mips/trap.c b/sys/mips/mips/trap.c index 96a2de4ee817..9d7a07606373 100644 --- a/sys/mips/mips/trap.c +++ b/sys/mips/mips/trap.c @@ -1,1700 +1,1701 @@ /* $OpenBSD: trap.c,v 1.19 1998/09/30 12:40:41 pefo Exp $ */ /* tracked to 1.23 */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and Ralph Campbell. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah Hdr: trap.c 1.32 91/04/06 * * from: @(#)trap.c 8.5 (Berkeley) 1/11/94 * JNPR: trap.c,v 1.13.2.2 2007/08/29 10:03:49 girish */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #include #endif #ifdef KDTRACE_HOOKS #include #endif #ifdef TRAP_DEBUG int trap_debug = 0; SYSCTL_INT(_machdep, OID_AUTO, trap_debug, CTLFLAG_RW, &trap_debug, 0, "Debug information on all traps"); #endif #define lbu_macro(data, addr) \ __asm __volatile ("lbu %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lb_macro(data, addr) \ __asm __volatile ("lb %0, 0x0(%1)" \ : "=r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lwl_macro(data, addr) \ __asm __volatile ("lwl %0, 0x0(%1)" \ : "+r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define lwr_macro(data, addr) \ __asm __volatile ("lwr %0, 0x0(%1)" \ : "+r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define ldl_macro(data, addr) \ __asm __volatile ("ldl %0, 0x0(%1)" \ : "+r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define ldr_macro(data, addr) \ __asm __volatile ("ldr %0, 0x0(%1)" \ : "+r" (data) /* outputs */ \ : "r" (addr)); /* inputs */ #define sb_macro(data, addr) \ __asm __volatile ("sb %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define swl_macro(data, addr) \ __asm __volatile ("swl %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define swr_macro(data, addr) \ __asm __volatile ("swr %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define sdl_macro(data, addr) \ __asm __volatile ("sdl %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ #define sdr_macro(data, addr) \ __asm __volatile ("sdr %0, 0x0(%1)" \ : /* outputs */ \ : "r" (data), "r" (addr)); /* inputs */ static void log_illegal_instruction(const char *, struct trapframe *); static void log_bad_page_fault(char *, struct trapframe *, int); static void log_frame_dump(struct trapframe *frame); static void get_mapping_info(vm_offset_t, pd_entry_t **, pt_entry_t **); int (*dtrace_invop_jump_addr)(struct trapframe *); #ifdef TRAP_DEBUG static void trap_frame_dump(struct trapframe *frame); #endif void (*machExceptionTable[]) (void)= { /* * The kernel exception handlers. */ MipsKernIntr, /* external interrupt */ MipsKernGenException, /* TLB modification */ MipsTLBInvalidException,/* TLB miss (load or instr. fetch) */ MipsTLBInvalidException,/* TLB miss (store) */ MipsKernGenException, /* address error (load or I-fetch) */ MipsKernGenException, /* address error (store) */ MipsKernGenException, /* bus error (I-fetch) */ MipsKernGenException, /* bus error (load or store) */ MipsKernGenException, /* system call */ MipsKernGenException, /* breakpoint */ MipsKernGenException, /* reserved instruction */ MipsKernGenException, /* coprocessor unusable */ MipsKernGenException, /* arithmetic overflow */ MipsKernGenException, /* trap exception */ MipsKernGenException, /* virtual coherence exception inst */ MipsKernGenException, /* floating point exception */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* watch exception */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* reserved */ MipsKernGenException, /* virtual coherence exception data */ /* * The user exception handlers. */ MipsUserIntr, /* 0 */ MipsUserGenException, /* 1 */ MipsTLBInvalidException,/* 2 */ MipsTLBInvalidException,/* 3 */ MipsUserGenException, /* 4 */ MipsUserGenException, /* 5 */ MipsUserGenException, /* 6 */ MipsUserGenException, /* 7 */ MipsUserGenException, /* 8 */ MipsUserGenException, /* 9 */ MipsUserGenException, /* 10 */ MipsUserGenException, /* 11 */ MipsUserGenException, /* 12 */ MipsUserGenException, /* 13 */ MipsUserGenException, /* 14 */ MipsUserGenException, /* 15 */ MipsUserGenException, /* 16 */ MipsUserGenException, /* 17 */ MipsUserGenException, /* 18 */ MipsUserGenException, /* 19 */ MipsUserGenException, /* 20 */ MipsUserGenException, /* 21 */ MipsUserGenException, /* 22 */ MipsUserGenException, /* 23 */ MipsUserGenException, /* 24 */ MipsUserGenException, /* 25 */ MipsUserGenException, /* 26 */ MipsUserGenException, /* 27 */ MipsUserGenException, /* 28 */ MipsUserGenException, /* 29 */ MipsUserGenException, /* 20 */ MipsUserGenException, /* 31 */ }; char *trap_type[] = { "external interrupt", "TLB modification", "TLB miss (load or instr. fetch)", "TLB miss (store)", "address error (load or I-fetch)", "address error (store)", "bus error (I-fetch)", "bus error (load or store)", "system call", "breakpoint", "reserved instruction", "coprocessor unusable", "arithmetic overflow", "trap", "virtual coherency instruction", "floating point", "reserved 16", "reserved 17", "reserved 18", "reserved 19", "reserved 20", "reserved 21", "reserved 22", "watch", "reserved 24", "reserved 25", "reserved 26", "reserved 27", "reserved 28", "reserved 29", "reserved 30", "virtual coherency data", }; #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) struct trapdebug trapdebug[TRAPSIZE], *trp = trapdebug; #endif #define KERNLAND(x) ((vm_offset_t)(x) >= VM_MIN_KERNEL_ADDRESS && (vm_offset_t)(x) < VM_MAX_KERNEL_ADDRESS) #define DELAYBRANCH(x) ((x) & MIPS_CR_BR_DELAY) /* * MIPS load/store access type */ enum { MIPS_LHU_ACCESS = 1, MIPS_LH_ACCESS, MIPS_LWU_ACCESS, MIPS_LW_ACCESS, MIPS_LD_ACCESS, MIPS_SH_ACCESS, MIPS_SW_ACCESS, MIPS_SD_ACCESS }; char *access_name[] = { "Load Halfword Unsigned", "Load Halfword", "Load Word Unsigned", "Load Word", "Load Doubleword", "Store Halfword", "Store Word", "Store Doubleword" }; #ifdef CPU_CNMIPS #include #endif static int allow_unaligned_acc = 1; SYSCTL_INT(_vm, OID_AUTO, allow_unaligned_acc, CTLFLAG_RW, &allow_unaligned_acc, 0, "Allow unaligned accesses"); /* * FP emulation is assumed to work on O32, but the code is outdated and crufty * enough that it's a more sensible default to have it disabled when using * other ABIs. At the very least, it needs a lot of help in using * type-semantic ABI-oblivious macros for everything it does. */ #if defined(__mips_o32) static int emulate_fp = 1; #else static int emulate_fp = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, emulate_fp, CTLFLAG_RW, &emulate_fp, 0, "Emulate unimplemented FPU instructions"); static int emulate_unaligned_access(struct trapframe *frame, int mode); extern void fswintrberr(void); /* XXX */ int cpu_fetch_syscall_args(struct thread *td) { struct trapframe *locr0; struct sysentvec *se; struct syscall_args *sa; int error, nsaved; locr0 = td->td_frame; sa = &td->td_sa; bzero(sa->args, sizeof(sa->args)); /* compute next PC after syscall instruction */ td->td_pcb->pcb_tpc = locr0->pc; /* Remember if restart */ if (DELAYBRANCH(locr0->cause)) /* Check BD bit */ locr0->pc = MipsEmulateBranch(locr0, locr0->pc, 0, 0); else locr0->pc += sizeof(int); sa->code = locr0->v0; + sa->original_code = sa->code; switch (sa->code) { case SYS___syscall: case SYS_syscall: /* * This is an indirect syscall, in which the code is the first argument. */ #if (!defined(__mips_n32) && !defined(__mips_n64)) || defined(COMPAT_FREEBSD32) if (sa->code == SYS___syscall && SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* * Like syscall, but code is a quad, so as to maintain alignment * for the rest of the arguments. */ if (_QUAD_LOWWORD == 0) sa->code = locr0->a0; else sa->code = locr0->a1; sa->args[0] = locr0->a2; sa->args[1] = locr0->a3; nsaved = 2; break; } #endif /* * This is either not a quad syscall, or is a quad syscall with a * new ABI in which quads fit in a single register. */ sa->code = locr0->a0; sa->args[0] = locr0->a1; sa->args[1] = locr0->a2; sa->args[2] = locr0->a3; nsaved = 3; #if defined(__mips_n32) || defined(__mips_n64) #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) { #endif /* * Non-o32 ABIs support more arguments in registers. */ sa->args[3] = locr0->a4; sa->args[4] = locr0->a5; sa->args[5] = locr0->a6; sa->args[6] = locr0->a7; nsaved += 4; #ifdef COMPAT_FREEBSD32 } #endif #endif break; default: /* * A direct syscall, arguments are just parameters to the syscall. */ sa->args[0] = locr0->a0; sa->args[1] = locr0->a1; sa->args[2] = locr0->a2; sa->args[3] = locr0->a3; nsaved = 4; #if defined (__mips_n32) || defined(__mips_n64) #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) { #endif /* * Non-o32 ABIs support more arguments in registers. */ sa->args[4] = locr0->a4; sa->args[5] = locr0->a5; sa->args[6] = locr0->a6; sa->args[7] = locr0->a7; nsaved += 4; #ifdef COMPAT_FREEBSD32 } #endif #endif break; } #ifdef TRAP_DEBUG if (trap_debug) printf("SYSCALL #%d pid:%u\n", sa->code, td->td_proc->p_pid); #endif se = td->td_proc->p_sysent; /* * XXX * Shouldn't this go before switching on the code? */ if (sa->code >= se->sv_size) sa->callp = &se->sv_table[0]; else sa->callp = &se->sv_table[sa->code]; if (sa->callp->sy_narg > nsaved) { #if defined(__mips_n32) || defined(__mips_n64) /* * XXX * Is this right for new ABIs? I think the 4 there * should be 8, size there are 8 registers to skip, * not 4, but I'm not certain. */ #ifdef COMPAT_FREEBSD32 if (!SV_PROC_FLAG(td->td_proc, SV_ILP32)) #endif printf("SYSCALL #%u pid:%u, narg (%u) > nsaved (%u).\n", sa->code, td->td_proc->p_pid, sa->callp->sy_narg, nsaved); #endif #if (defined(__mips_n32) || defined(__mips_n64)) && defined(COMPAT_FREEBSD32) if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { unsigned i; int32_t arg; error = 0; /* XXX GCC is awful. */ for (i = nsaved; i < sa->callp->sy_narg; i++) { error = copyin((caddr_t)(intptr_t)(locr0->sp + (4 + (i - nsaved)) * sizeof(int32_t)), (caddr_t)&arg, sizeof arg); if (error != 0) break; sa->args[i] = arg; } } else #endif error = copyin((caddr_t)(intptr_t)(locr0->sp + 4 * sizeof(register_t)), (caddr_t)&sa->args[nsaved], (u_int)(sa->callp->sy_narg - nsaved) * sizeof(register_t)); if (error != 0) { locr0->v0 = error; locr0->a3 = 1; } } else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = locr0->v1; } return (error); } #undef __FBSDID #define __FBSDID(x) #include "../../kern/subr_syscall.c" /* * Handle an exception. * Called from MipsKernGenException() or MipsUserGenException() * when a processor trap occurs. * In the case of a kernel trap, we return the pc where to resume if * p->p_addr->u_pcb.pcb_onfault is set, otherwise, return old pc. */ register_t trap(struct trapframe *trapframe) { int type, usermode; int i = 0; unsigned ucode = 0; struct thread *td = curthread; struct proc *p = curproc; vm_prot_t ftype; pmap_t pmap; int access_type; ksiginfo_t ksi; char *msg = NULL; intptr_t addr = 0; register_t pc; int cop, error; register_t *frame_regs; #ifdef KDB bool handled; #endif trapdebug_enter(trapframe, 0); #ifdef KDB if (kdb_active) { kdb_reenter(); return (0); } #endif type = (trapframe->cause & MIPS_CR_EXC_CODE) >> MIPS_CR_EXC_CODE_SHIFT; if (TRAPF_USERMODE(trapframe)) { type |= T_USER; usermode = 1; } else { usermode = 0; } /* * Enable hardware interrupts if they were on before the trap. If it * was off disable all so we don't accidently enable it when doing a * return to userland. */ if (trapframe->sr & MIPS_SR_INT_IE) { set_intr_mask(trapframe->sr & MIPS_SR_INT_MASK); intr_enable(); } else { intr_disable(); } #ifdef TRAP_DEBUG if (trap_debug) { static vm_offset_t last_badvaddr = 0; static vm_offset_t this_badvaddr = 0; static int count = 0; u_int32_t pid; printf("trap type %x (%s - ", type, trap_type[type & (~T_USER)]); if (type & T_USER) printf("user mode)\n"); else printf("kernel mode)\n"); #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif pid = mips_rd_entryhi() & TLBHI_ASID_MASK; printf("badaddr = %#jx, pc = %#jx, ra = %#jx, sp = %#jx, sr = %jx, pid = %d, ASID = %u\n", (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra, (intmax_t)trapframe->sp, (intmax_t)trapframe->sr, (curproc ? curproc->p_pid : -1), pid); switch (type & ~T_USER) { case T_TLB_MOD: case T_TLB_LD_MISS: case T_TLB_ST_MISS: case T_ADDR_ERR_LD: case T_ADDR_ERR_ST: this_badvaddr = trapframe->badvaddr; break; case T_SYSCALL: this_badvaddr = trapframe->ra; break; default: this_badvaddr = trapframe->pc; break; } if ((last_badvaddr == this_badvaddr) && ((type & ~T_USER) != T_SYSCALL) && ((type & ~T_USER) != T_COP_UNUSABLE)) { if (++count == 3) { trap_frame_dump(trapframe); panic("too many faults at %p\n", (void *)last_badvaddr); } } else { last_badvaddr = this_badvaddr; count = 0; } } #endif #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ /* * XXXDTRACE: add pid probe handler here (if ever) */ if (!usermode) { if (dtrace_trap_func != NULL && (*dtrace_trap_func)(trapframe, type) != 0) return (trapframe->pc); } #endif switch (type) { case T_MCHECK: #ifdef DDB kdb_trap(type, 0, trapframe); #endif panic("MCHECK\n"); break; case T_TLB_MOD: /* check for kernel address */ if (KERNLAND(trapframe->badvaddr)) { if (pmap_emulate_modified(kernel_pmap, trapframe->badvaddr) != 0) { ftype = VM_PROT_WRITE; goto kernel_fault; } return (trapframe->pc); } /* FALLTHROUGH */ case T_TLB_MOD + T_USER: pmap = &p->p_vmspace->vm_pmap; if (pmap_emulate_modified(pmap, trapframe->badvaddr) != 0) { ftype = VM_PROT_WRITE; goto dofault; } if (!usermode) return (trapframe->pc); goto out; case T_TLB_LD_MISS: case T_TLB_ST_MISS: ftype = (type == T_TLB_ST_MISS) ? VM_PROT_WRITE : VM_PROT_READ; /* check for kernel address */ if (KERNLAND(trapframe->badvaddr)) { vm_offset_t va; int rv; kernel_fault: va = (vm_offset_t)trapframe->badvaddr; rv = vm_fault_trap(kernel_map, va, ftype, VM_FAULT_NORMAL, NULL, NULL); if (rv == KERN_SUCCESS) return (trapframe->pc); if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } goto err; } /* * It is an error for the kernel to access user space except * through the copyin/copyout routines. */ if (td->td_pcb->pcb_onfault == NULL) goto err; goto dofault; case T_TLB_LD_MISS + T_USER: ftype = VM_PROT_READ; goto dofault; case T_TLB_ST_MISS + T_USER: ftype = VM_PROT_WRITE; dofault: { vm_offset_t va; struct vmspace *vm; vm_map_t map; int rv = 0; vm = p->p_vmspace; map = &vm->vm_map; va = (vm_offset_t)trapframe->badvaddr; if (KERNLAND(trapframe->badvaddr)) { /* * Don't allow user-mode faults in kernel * address space. */ goto nogo; } rv = vm_fault_trap(map, va, ftype, VM_FAULT_NORMAL, &i, &ucode); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ #ifdef VMFAULT_TRACE printf("vm_fault(%p (pmap %p), %p (%p), %x, %d) -> %x at pc %p\n", map, &vm->vm_pmap, (void *)va, (void *)(intptr_t)trapframe->badvaddr, ftype, VM_FAULT_NORMAL, rv, (void *)(intptr_t)trapframe->pc); #endif if (rv == KERN_SUCCESS) { if (!usermode) { return (trapframe->pc); } goto out; } nogo: if (!usermode) { if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } goto err; } addr = trapframe->badvaddr; msg = "BAD_PAGE_FAULT"; log_bad_page_fault(msg, trapframe, type); break; } case T_ADDR_ERR_LD + T_USER: /* misaligned or kseg access */ case T_ADDR_ERR_ST + T_USER: /* misaligned or kseg access */ if (trapframe->badvaddr < 0 || trapframe->badvaddr >= VM_MAXUSER_ADDRESS) { msg = "ADDRESS_SPACE_ERR"; } else if (allow_unaligned_acc) { int mode; if (type == (T_ADDR_ERR_LD + T_USER)) mode = VM_PROT_READ; else mode = VM_PROT_WRITE; access_type = emulate_unaligned_access(trapframe, mode); if (access_type != 0) goto out; msg = "ALIGNMENT_FIX_ERR"; } else { msg = "ADDRESS_ERR"; } /* FALL THROUGH */ case T_BUS_ERR_IFETCH + T_USER: /* BERR asserted to cpu */ case T_BUS_ERR_LD_ST + T_USER: /* BERR asserted to cpu */ ucode = 0; /* XXX should be VM_PROT_something */ i = SIGBUS; addr = trapframe->pc; if (!msg) msg = "BUS_ERR"; log_bad_page_fault(msg, trapframe, type); break; case T_SYSCALL + T_USER: { syscallenter(td); #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) if (trp == trapdebug) trapdebug[TRAPSIZE - 1].code = td->td_sa.code; else trp[-1].code = td->td_sa.code; #endif trapdebug_enter(td->td_frame, -td->td_sa.code); /* * The sync'ing of I & D caches for SYS_ptrace() is * done by procfs_domem() through procfs_rwmem() * instead of being done here under a special check * for SYS_ptrace(). */ syscallret(td); return (trapframe->pc); } #if defined(KDTRACE_HOOKS) || defined(DDB) case T_BREAK: #ifdef KDTRACE_HOOKS if (!usermode && dtrace_invop_jump_addr != NULL && dtrace_invop_jump_addr(trapframe) == 0) return (trapframe->pc); #endif #ifdef DDB kdb_trap(type, 0, trapframe); return (trapframe->pc); #endif #endif case T_BREAK + T_USER: { intptr_t va; uint32_t instr; i = SIGTRAP; ucode = TRAP_BRKPT; /* compute address of break instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); addr = va; if (td->td_md.md_ss_addr != va) break; /* read break instruction */ instr = fuword32((caddr_t)va); if (instr != MIPS_BREAK_SSTEP) break; CTR3(KTR_PTRACE, "trap: tid %d, single step at %#lx: %#08x", td->td_tid, va, instr); PROC_LOCK(p); _PHOLD(p); error = ptrace_clear_single_step(td); _PRELE(p); PROC_UNLOCK(p); if (error == 0) ucode = TRAP_TRACE; break; } case T_IWATCH + T_USER: case T_DWATCH + T_USER: { intptr_t va; /* compute address of trapped instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); printf("watch exception @ %p\n", (void *)va); i = SIGTRAP; ucode = TRAP_BRKPT; addr = va; break; } case T_TRAP + T_USER: { intptr_t va; struct trapframe *locr0 = td->td_frame; /* compute address of trap instruction */ va = trapframe->pc; if (DELAYBRANCH(trapframe->cause)) va += sizeof(int); if (DELAYBRANCH(trapframe->cause)) { /* Check BD bit */ locr0->pc = MipsEmulateBranch(locr0, trapframe->pc, 0, 0); } else { locr0->pc += sizeof(int); } addr = va; i = SIGEMT; /* Stuff it with something for now */ break; } case T_RES_INST + T_USER: { InstFmt inst; inst = *(InstFmt *)(intptr_t)trapframe->pc; switch (inst.RType.op) { case OP_SPECIAL3: switch (inst.RType.func) { case OP_RDHWR: /* Register 29 used for TLS */ if (inst.RType.rd == 29) { frame_regs = &(trapframe->zero); frame_regs[inst.RType.rt] = (register_t)(intptr_t)td->td_md.md_tls; frame_regs[inst.RType.rt] += td->td_proc->p_md.md_tls_tcb_offset; trapframe->pc += sizeof(int); goto out; } break; } break; } log_illegal_instruction("RES_INST", trapframe); i = SIGILL; addr = trapframe->pc; } break; case T_C2E: case T_C2E + T_USER: goto err; break; case T_COP_UNUSABLE: #ifdef CPU_CNMIPS cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT; /* Handle only COP2 exception */ if (cop != 2) goto err; addr = trapframe->pc; /* save userland cop2 context if it has been touched */ if ((td->td_md.md_flags & MDTD_COP2USED) && (td->td_md.md_cop2owner == COP2_OWNER_USERLAND)) { if (td->td_md.md_ucop2) octeon_cop2_save(td->td_md.md_ucop2); else panic("COP2 was used in user mode but md_ucop2 is NULL"); } if (td->td_md.md_cop2 == NULL) { td->td_md.md_cop2 = octeon_cop2_alloc_ctx(); if (td->td_md.md_cop2 == NULL) panic("Failed to allocate COP2 context"); memset(td->td_md.md_cop2, 0, sizeof(*td->td_md.md_cop2)); } octeon_cop2_restore(td->td_md.md_cop2); /* Make userland re-request its context */ td->td_frame->sr &= ~MIPS_SR_COP_2_BIT; td->td_md.md_flags |= MDTD_COP2USED; td->td_md.md_cop2owner = COP2_OWNER_KERNEL; /* Enable COP2, it will be disabled in cpu_switch */ mips_wr_status(mips_rd_status() | MIPS_SR_COP_2_BIT); return (trapframe->pc); #else goto err; break; #endif case T_COP_UNUSABLE + T_USER: cop = (trapframe->cause & MIPS_CR_COP_ERR) >> MIPS_CR_COP_ERR_SHIFT; if (cop == 1) { /* FP (COP1) instruction */ if (cpuinfo.fpu_id == 0) { log_illegal_instruction("COP1_UNUSABLE", trapframe); i = SIGILL; break; } addr = trapframe->pc; MipsSwitchFPState(PCPU_GET(fpcurthread), td->td_frame); PCPU_SET(fpcurthread, td); #if defined(__mips_n32) || defined(__mips_n64) td->td_frame->sr |= MIPS_SR_COP_1_BIT | MIPS_SR_FR; #else td->td_frame->sr |= MIPS_SR_COP_1_BIT; #endif td->td_md.md_flags |= MDTD_FPUSED; goto out; } #ifdef CPU_CNMIPS else if (cop == 2) { addr = trapframe->pc; if ((td->td_md.md_flags & MDTD_COP2USED) && (td->td_md.md_cop2owner == COP2_OWNER_KERNEL)) { if (td->td_md.md_cop2) octeon_cop2_save(td->td_md.md_cop2); else panic("COP2 was used in kernel mode but md_cop2 is NULL"); } if (td->td_md.md_ucop2 == NULL) { td->td_md.md_ucop2 = octeon_cop2_alloc_ctx(); if (td->td_md.md_ucop2 == NULL) panic("Failed to allocate userland COP2 context"); memset(td->td_md.md_ucop2, 0, sizeof(*td->td_md.md_ucop2)); } octeon_cop2_restore(td->td_md.md_ucop2); td->td_frame->sr |= MIPS_SR_COP_2_BIT; td->td_md.md_flags |= MDTD_COP2USED; td->td_md.md_cop2owner = COP2_OWNER_USERLAND; goto out; } #endif else { log_illegal_instruction("COPn_UNUSABLE", trapframe); i = SIGILL; /* only FPU instructions allowed */ break; } case T_FPE: #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) trapDump("fpintr"); #else printf("FPU Trap: PC %#jx CR %x SR %x\n", (intmax_t)trapframe->pc, (unsigned)trapframe->cause, (unsigned)trapframe->sr); goto err; #endif case T_FPE + T_USER: if (!emulate_fp) { i = SIGFPE; addr = trapframe->pc; break; } MipsFPTrap(trapframe->sr, trapframe->cause, trapframe->pc); goto out; case T_OVFLOW + T_USER: i = SIGFPE; addr = trapframe->pc; break; case T_ADDR_ERR_LD: /* misaligned access */ case T_ADDR_ERR_ST: /* misaligned access */ #ifdef TRAP_DEBUG if (trap_debug) { printf("+++ ADDR_ERR: type = %d, badvaddr = %#jx\n", type, (intmax_t)trapframe->badvaddr); } #endif /* Only allow emulation on a user address */ if (allow_unaligned_acc && ((vm_offset_t)trapframe->badvaddr < VM_MAXUSER_ADDRESS)) { int mode; if (type == T_ADDR_ERR_LD) mode = VM_PROT_READ; else mode = VM_PROT_WRITE; access_type = emulate_unaligned_access(trapframe, mode); if (access_type != 0) return (trapframe->pc); } /* FALLTHROUGH */ case T_BUS_ERR_LD_ST: /* BERR asserted to cpu */ if (td->td_pcb->pcb_onfault != NULL) { pc = (register_t)(intptr_t)td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = NULL; return (pc); } /* FALLTHROUGH */ default: err: #if !defined(SMP) && defined(DEBUG) trapDump("trap"); #endif #ifdef SMP printf("cpu:%d-", PCPU_GET(cpuid)); #endif printf("Trap cause = %d (%s - ", type, trap_type[type & (~T_USER)]); if (type & T_USER) printf("user mode)\n"); else printf("kernel mode)\n"); #ifdef TRAP_DEBUG if (trap_debug) printf("badvaddr = %#jx, pc = %#jx, ra = %#jx, sr = %#jxx\n", (intmax_t)trapframe->badvaddr, (intmax_t)trapframe->pc, (intmax_t)trapframe->ra, (intmax_t)trapframe->sr); #endif #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(type, 0, trapframe); kdb_why = KDB_WHY_UNSET; if (handled) return (trapframe->pc); } #endif panic("trap"); } td->td_frame->pc = trapframe->pc; td->td_frame->cause = trapframe->cause; td->td_frame->badvaddr = trapframe->badvaddr; ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type & ~T_USER; trapsignal(td, &ksi); out: /* * Note: we should only get here if returning to user mode. */ userret(td, trapframe); return (trapframe->pc); } #if !defined(SMP) && (defined(DDB) || defined(DEBUG)) void trapDump(char *msg) { register_t s; int i; s = intr_disable(); printf("trapDump(%s)\n", msg); for (i = 0; i < TRAPSIZE; i++) { if (trp == trapdebug) { trp = &trapdebug[TRAPSIZE - 1]; } else { trp--; } if (trp->cause == 0) break; printf("%s: ADR %jx PC %jx CR %jx SR %jx\n", trap_type[(trp->cause & MIPS_CR_EXC_CODE) >> MIPS_CR_EXC_CODE_SHIFT], (intmax_t)trp->vadr, (intmax_t)trp->pc, (intmax_t)trp->cause, (intmax_t)trp->status); printf(" RA %jx SP %jx code %d\n", (intmax_t)trp->ra, (intmax_t)trp->sp, (int)trp->code); } intr_restore(s); } #endif /* * Return the resulting PC as if the branch was executed. */ uintptr_t MipsEmulateBranch(struct trapframe *framePtr, uintptr_t instPC, int fpcCSR, uintptr_t instptr) { InstFmt inst; register_t *regsPtr = (register_t *) framePtr; uintptr_t retAddr = 0; int condition; #define GetBranchDest(InstPtr, inst) \ (InstPtr + 4 + ((short)inst.IType.imm << 2)) if (instptr) { if (instptr < MIPS_KSEG0_START) inst.word = fuword32((void *)instptr); else inst = *(InstFmt *) instptr; } else { if ((vm_offset_t)instPC < MIPS_KSEG0_START) inst.word = fuword32((void *)instPC); else inst = *(InstFmt *) instPC; } switch ((int)inst.JType.op) { case OP_SPECIAL: switch ((int)inst.RType.func) { case OP_JR: case OP_JALR: retAddr = regsPtr[inst.RType.rs]; break; default: retAddr = instPC + 4; break; } break; case OP_BCOND: switch ((int)inst.IType.rt) { case OP_BLTZ: case OP_BLTZL: case OP_BLTZAL: case OP_BLTZALL: if ((int)(regsPtr[inst.RType.rs]) < 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BGEZ: case OP_BGEZL: case OP_BGEZAL: case OP_BGEZALL: if ((int)(regsPtr[inst.RType.rs]) >= 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_TGEI: case OP_TGEIU: case OP_TLTI: case OP_TLTIU: case OP_TEQI: case OP_TNEI: retAddr = instPC + 4; /* Like syscall... */ break; default: panic("MipsEmulateBranch: Bad branch cond"); } break; case OP_J: case OP_JAL: retAddr = (inst.JType.target << 2) | ((unsigned)(instPC + 4) & 0xF0000000); break; case OP_BEQ: case OP_BEQL: if (regsPtr[inst.RType.rs] == regsPtr[inst.RType.rt]) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BNE: case OP_BNEL: if (regsPtr[inst.RType.rs] != regsPtr[inst.RType.rt]) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BLEZ: case OP_BLEZL: if ((int)(regsPtr[inst.RType.rs]) <= 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_BGTZ: case OP_BGTZL: if ((int)(regsPtr[inst.RType.rs]) > 0) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; case OP_COP1: switch (inst.RType.rs) { case OP_BCx: case OP_BCy: if ((inst.RType.rt & COPz_BC_TF_MASK) == COPz_BC_TRUE) condition = fpcCSR & MIPS_FPU_COND_BIT; else condition = !(fpcCSR & MIPS_FPU_COND_BIT); if (condition) retAddr = GetBranchDest(instPC, inst); else retAddr = instPC + 8; break; default: retAddr = instPC + 4; } break; default: retAddr = instPC + 4; } return (retAddr); } static void log_frame_dump(struct trapframe *frame) { log(LOG_ERR, "Trapframe Register Dump:\n"); log(LOG_ERR, "\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n", (intmax_t)0, (intmax_t)frame->ast, (intmax_t)frame->v0, (intmax_t)frame->v1); log(LOG_ERR, "\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n", (intmax_t)frame->a0, (intmax_t)frame->a1, (intmax_t)frame->a2, (intmax_t)frame->a3); #if defined(__mips_n32) || defined(__mips_n64) log(LOG_ERR, "\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta6: %#jx\n", (intmax_t)frame->a4, (intmax_t)frame->a5, (intmax_t)frame->a6, (intmax_t)frame->a7); log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); #else log(LOG_ERR, "\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); log(LOG_ERR, "\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n", (intmax_t)frame->t4, (intmax_t)frame->t5, (intmax_t)frame->t6, (intmax_t)frame->t7); #endif log(LOG_ERR, "\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n", (intmax_t)frame->t8, (intmax_t)frame->t9, (intmax_t)frame->s0, (intmax_t)frame->s1); log(LOG_ERR, "\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n", (intmax_t)frame->s2, (intmax_t)frame->s3, (intmax_t)frame->s4, (intmax_t)frame->s5); log(LOG_ERR, "\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n", (intmax_t)frame->s6, (intmax_t)frame->s7, (intmax_t)frame->k0, (intmax_t)frame->k1); log(LOG_ERR, "\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n", (intmax_t)frame->gp, (intmax_t)frame->sp, (intmax_t)frame->s8, (intmax_t)frame->ra); log(LOG_ERR, "\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n", (intmax_t)frame->sr, (intmax_t)frame->mullo, (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr); log(LOG_ERR, "\tcause: %#jx\tpc: %#jx\n", (intmax_t)frame->cause, (intmax_t)frame->pc); } #ifdef TRAP_DEBUG static void trap_frame_dump(struct trapframe *frame) { printf("Trapframe Register Dump:\n"); printf("\tzero: %#jx\tat: %#jx\tv0: %#jx\tv1: %#jx\n", (intmax_t)0, (intmax_t)frame->ast, (intmax_t)frame->v0, (intmax_t)frame->v1); printf("\ta0: %#jx\ta1: %#jx\ta2: %#jx\ta3: %#jx\n", (intmax_t)frame->a0, (intmax_t)frame->a1, (intmax_t)frame->a2, (intmax_t)frame->a3); #if defined(__mips_n32) || defined(__mips_n64) printf("\ta4: %#jx\ta5: %#jx\ta6: %#jx\ta7: %#jx\n", (intmax_t)frame->a4, (intmax_t)frame->a5, (intmax_t)frame->a6, (intmax_t)frame->a7); printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); #else printf("\tt0: %#jx\tt1: %#jx\tt2: %#jx\tt3: %#jx\n", (intmax_t)frame->t0, (intmax_t)frame->t1, (intmax_t)frame->t2, (intmax_t)frame->t3); printf("\tt4: %#jx\tt5: %#jx\tt6: %#jx\tt7: %#jx\n", (intmax_t)frame->t4, (intmax_t)frame->t5, (intmax_t)frame->t6, (intmax_t)frame->t7); #endif printf("\tt8: %#jx\tt9: %#jx\ts0: %#jx\ts1: %#jx\n", (intmax_t)frame->t8, (intmax_t)frame->t9, (intmax_t)frame->s0, (intmax_t)frame->s1); printf("\ts2: %#jx\ts3: %#jx\ts4: %#jx\ts5: %#jx\n", (intmax_t)frame->s2, (intmax_t)frame->s3, (intmax_t)frame->s4, (intmax_t)frame->s5); printf("\ts6: %#jx\ts7: %#jx\tk0: %#jx\tk1: %#jx\n", (intmax_t)frame->s6, (intmax_t)frame->s7, (intmax_t)frame->k0, (intmax_t)frame->k1); printf("\tgp: %#jx\tsp: %#jx\ts8: %#jx\tra: %#jx\n", (intmax_t)frame->gp, (intmax_t)frame->sp, (intmax_t)frame->s8, (intmax_t)frame->ra); printf("\tsr: %#jx\tmullo: %#jx\tmulhi: %#jx\tbadvaddr: %#jx\n", (intmax_t)frame->sr, (intmax_t)frame->mullo, (intmax_t)frame->mulhi, (intmax_t)frame->badvaddr); printf("\tcause: %#jx\tpc: %#jx\n", (intmax_t)frame->cause, (intmax_t)frame->pc); } #endif static void get_mapping_info(vm_offset_t va, pd_entry_t **pdepp, pt_entry_t **ptepp) { pt_entry_t *ptep; pd_entry_t *pdep; struct proc *p = curproc; pdep = (&(p->p_vmspace->vm_pmap.pm_segtab[(va >> SEGSHIFT) & (NPDEPG - 1)])); if (*pdep) ptep = pmap_pte(&p->p_vmspace->vm_pmap, va); else ptep = (pt_entry_t *)0; *pdepp = pdep; *ptepp = ptep; } static void log_illegal_instruction(const char *msg, struct trapframe *frame) { pt_entry_t *ptep; pd_entry_t *pdep; unsigned int *addr, instr[4]; struct thread *td; struct proc *p; register_t pc; td = curthread; p = td->td_proc; #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx ra %#jx\n", msg, p->p_pid, (long)td->td_tid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, (intmax_t)pc, (intmax_t)frame->ra); /* log registers in trap frame */ log_frame_dump(frame); get_mapping_info((vm_offset_t)pc, &pdep, &ptep); /* * Dump a few words around faulting instruction, if the addres is * valid. */ addr = (unsigned int *)(intptr_t)pc; if ((pc & 3) == 0 && copyin(addr, instr, sizeof(instr)) == 0) { /* dump page table entry for faulting instruction */ log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n", addr); log(LOG_ERR, "%08x %08x %08x %08x\n", instr[0], instr[1], instr[2], instr[3]); } else { log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } } static void log_bad_page_fault(char *msg, struct trapframe *frame, int trap_type) { pt_entry_t *ptep; pd_entry_t *pdep; unsigned int *addr, instr[4]; struct thread *td; struct proc *p; char *read_or_write; register_t pc; trap_type &= ~T_USER; td = curthread; p = td->td_proc; #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif switch (trap_type) { case T_TLB_MOD: case T_TLB_ST_MISS: case T_ADDR_ERR_ST: read_or_write = "write"; break; case T_TLB_LD_MISS: case T_ADDR_ERR_LD: case T_BUS_ERR_IFETCH: read_or_write = "read"; break; default: read_or_write = "unknown"; } pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); log(LOG_ERR, "%s: pid %d tid %ld (%s), uid %d: pc %#jx got a %s fault " "(type %#x) at %#jx\n", msg, p->p_pid, (long)td->td_tid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, (intmax_t)pc, read_or_write, trap_type, (intmax_t)frame->badvaddr); /* log registers in trap frame */ log_frame_dump(frame); get_mapping_info((vm_offset_t)pc, &pdep, &ptep); /* * Dump a few words around faulting instruction, if the addres is * valid. */ addr = (unsigned int *)(intptr_t)pc; if ((pc & 3) == 0 && pc != frame->badvaddr && trap_type != T_BUS_ERR_IFETCH && copyin((caddr_t)(intptr_t)pc, instr, sizeof(instr)) == 0) { /* dump page table entry for faulting instruction */ log(LOG_ERR, "Page table info for pc address %#jx: pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); log(LOG_ERR, "Dumping 4 words starting at pc address %p: \n", addr); log(LOG_ERR, "%08x %08x %08x %08x\n", instr[0], instr[1], instr[2], instr[3]); } else { log(LOG_ERR, "pc address %#jx is inaccessible, pde = %p, pte = %#jx\n", (intmax_t)pc, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } get_mapping_info((vm_offset_t)frame->badvaddr, &pdep, &ptep); log(LOG_ERR, "Page table info for bad address %#jx: pde = %p, pte = %#jx\n", (intmax_t)frame->badvaddr, (void *)(intptr_t)*pdep, (uintmax_t)(ptep ? *ptep : 0)); } /* * Unaligned load/store emulation */ static int mips_unaligned_load_store(struct trapframe *frame, int mode, register_t addr, register_t pc) { register_t *reg = (register_t *) frame; u_int32_t inst = *((u_int32_t *)(intptr_t)pc); register_t value_msb = 0, value = 0; unsigned size; /* * ADDR_ERR faults have higher priority than TLB * Miss faults. Therefore, it is necessary to * verify that the faulting address is a valid * virtual address within the process' address space * before trying to emulate the unaligned access. */ switch (MIPS_INST_OPCODE(inst)) { case OP_LHU: case OP_LH: case OP_SH: size = 2; break; case OP_LWU: case OP_LW: case OP_SW: size = 4; break; case OP_LD: case OP_SD: size = 8; break; default: printf("%s: unhandled opcode in address error: %#x\n", __func__, MIPS_INST_OPCODE(inst)); return (0); } if (!useracc((void *)rounddown2((vm_offset_t)addr, size), size * 2, mode)) return (0); /* * XXX * Handle LL/SC LLD/SCD. */ switch (MIPS_INST_OPCODE(inst)) { case OP_LHU: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lbu_macro(value_msb, addr); addr += 1; lbu_macro(value, addr); value |= value_msb << 8; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LHU_ACCESS); case OP_LH: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lb_macro(value_msb, addr); addr += 1; lbu_macro(value, addr); value |= value_msb << 8; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LH_ACCESS); case OP_LWU: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lwl_macro(value, addr); addr += 3; lwr_macro(value, addr); value &= 0xffffffff; reg[MIPS_INST_RT(inst)] = value; return (MIPS_LWU_ACCESS); case OP_LW: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); lwl_macro(value, addr); addr += 3; lwr_macro(value, addr); reg[MIPS_INST_RT(inst)] = value; return (MIPS_LW_ACCESS); #if defined(__mips_n32) || defined(__mips_n64) case OP_LD: KASSERT(mode == VM_PROT_READ, ("access mode must be read for load instruction.")); ldl_macro(value, addr); addr += 7; ldr_macro(value, addr); reg[MIPS_INST_RT(inst)] = value; return (MIPS_LD_ACCESS); #endif case OP_SH: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; value_msb = value >> 8; sb_macro(value_msb, addr); addr += 1; sb_macro(value, addr); return (MIPS_SH_ACCESS); case OP_SW: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; swl_macro(value, addr); addr += 3; swr_macro(value, addr); return (MIPS_SW_ACCESS); #if defined(__mips_n32) || defined(__mips_n64) case OP_SD: KASSERT(mode == VM_PROT_WRITE, ("access mode must be write for store instruction.")); value = reg[MIPS_INST_RT(inst)]; sdl_macro(value, addr); addr += 7; sdr_macro(value, addr); return (MIPS_SD_ACCESS); #endif } panic("%s: should not be reached.", __func__); } /* * XXX TODO: SMP? */ static struct timeval unaligned_lasterr; static int unaligned_curerr; static int unaligned_pps_log_limit = 4; SYSCTL_INT(_machdep, OID_AUTO, unaligned_log_pps_limit, CTLFLAG_RWTUN, &unaligned_pps_log_limit, 0, "limit number of userland unaligned log messages per second"); static int emulate_unaligned_access(struct trapframe *frame, int mode) { register_t pc; int access_type = 0; struct thread *td = curthread; struct proc *p = curproc; pc = frame->pc + (DELAYBRANCH(frame->cause) ? 4 : 0); /* * Fall through if it's instruction fetch exception */ if (!((pc & 3) || (pc == frame->badvaddr))) { /* * Handle unaligned load and store */ /* * Return access type if the instruction was emulated. * Otherwise restore pc and fall through. */ access_type = mips_unaligned_load_store(frame, mode, frame->badvaddr, pc); if (access_type) { if (DELAYBRANCH(frame->cause)) frame->pc = MipsEmulateBranch(frame, frame->pc, 0, 0); else frame->pc += 4; if (ppsratecheck(&unaligned_lasterr, &unaligned_curerr, unaligned_pps_log_limit)) { /* XXX TODO: keep global/tid/pid counters? */ log(LOG_INFO, "Unaligned %s: pid=%ld (%s), tid=%ld, " "pc=%#jx, badvaddr=%#jx\n", access_name[access_type - 1], (long) p->p_pid, p->p_comm, (long) td->td_tid, (intmax_t)pc, (intmax_t)frame->badvaddr); } } } return access_type; } diff --git a/sys/powerpc/include/proc.h b/sys/powerpc/include/proc.h index d4df3ccfefef..aac4e66b39fc 100644 --- a/sys/powerpc/include/proc.h +++ b/sys/powerpc/include/proc.h @@ -1,83 +1,84 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: proc.h,v 1.2 1997/04/16 22:57:48 thorpej Exp $ * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ /* * Machine-dependent part of the proc structure */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_msr; /* (k) */ }; struct mdproc { /* * Avoid empty structs because they are undefined behavior. */ long md_spare; }; #ifdef __powerpc64__ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 816 #else #define KINFO_PROC_SIZE 816 #endif #define MAXARGS 8 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[MAXARGS]; }; #ifdef _KERNEL #include /* Get the current kernel thread stack usage. */ #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE - sizeof(struct pcb); \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) #endif #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/powerpc/powerpc/trap.c b/sys/powerpc/powerpc/trap.c index 3a1bdf7cde07..ab5189a0f418 100644 --- a/sys/powerpc/powerpc/trap.c +++ b/sys/powerpc/powerpc/trap.c @@ -1,1015 +1,1016 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: trap.c,v 1.58 2002/03/04 04:07:35 dbj Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Below matches setjmp.S */ #define FAULTBUF_LR 21 #define FAULTBUF_R1 1 #define FAULTBUF_R2 2 #define FAULTBUF_CR 22 #define FAULTBUF_R14 3 #define MOREARGS(sp) ((caddr_t)((uintptr_t)(sp) + \ sizeof(struct callframe) - 3*sizeof(register_t))) /* more args go here */ static void trap_fatal(struct trapframe *frame); static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user); static bool trap_pfault(struct trapframe *frame, bool user, int *signo, int *ucode); static int fix_unaligned(struct thread *td, struct trapframe *frame); static int handle_onfault(struct trapframe *frame); static void syscall(struct trapframe *frame); #if defined(__powerpc64__) && defined(AIM) static void normalize_inputs(void); #endif extern vm_offset_t __startkernel; extern int copy_fault(void); extern int fusufault(void); #ifdef KDB int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ #endif struct powerpc_exception { u_int vector; char *name; }; #ifdef KDTRACE_HOOKS #include int (*dtrace_invop_jump_addr)(struct trapframe *); #endif static struct powerpc_exception powerpc_exceptions[] = { { EXC_CRIT, "critical input" }, { EXC_RST, "system reset" }, { EXC_MCHK, "machine check" }, { EXC_DSI, "data storage interrupt" }, { EXC_DSE, "data segment exception" }, { EXC_ISI, "instruction storage interrupt" }, { EXC_ISE, "instruction segment exception" }, { EXC_EXI, "external interrupt" }, { EXC_ALI, "alignment" }, { EXC_PGM, "program" }, { EXC_HEA, "hypervisor emulation assistance" }, { EXC_FPU, "floating-point unavailable" }, { EXC_APU, "auxiliary proc unavailable" }, { EXC_DECR, "decrementer" }, { EXC_FIT, "fixed-interval timer" }, { EXC_WDOG, "watchdog timer" }, { EXC_SC, "system call" }, { EXC_TRC, "trace" }, { EXC_FPA, "floating-point assist" }, { EXC_DEBUG, "debug" }, { EXC_PERF, "performance monitoring" }, { EXC_VEC, "altivec unavailable" }, { EXC_VSX, "vsx unavailable" }, { EXC_FAC, "facility unavailable" }, { EXC_ITMISS, "instruction tlb miss" }, { EXC_DLMISS, "data load tlb miss" }, { EXC_DSMISS, "data store tlb miss" }, { EXC_BPT, "instruction breakpoint" }, { EXC_SMI, "system management" }, { EXC_VECAST_G4, "altivec assist" }, { EXC_THRM, "thermal management" }, { EXC_RUNMODETRC, "run mode/trace" }, { EXC_SOFT_PATCH, "soft patch exception" }, { EXC_LAST, NULL } }; static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); #define ESR_BITMASK \ "\20" \ "\040b0\037b1\036b2\035b3\034PIL\033PRR\032PTR\031FP" \ "\030ST\027b9\026DLK\025ILK\024b12\023b13\022BO\021PIE" \ "\020b16\017b17\016b18\015b19\014b20\013b21\012b22\011b23" \ "\010SPE\007EPID\006b26\005b27\004b28\003b29\002b30\001b31" #define MCSR_BITMASK \ "\20" \ "\040MCP\037ICERR\036DCERR\035TLBPERR\034L2MMU_MHIT\033b5\032b6\031b7" \ "\030b8\027b9\026b10\025NMI\024MAV\023MEA\022b14\021IF" \ "\020LD\017ST\016LDG\015b19\014b20\013b21\012b22\011b23" \ "\010b24\007b25\006b26\005b27\004b28\003b29\002TLBSYNC\001BSL2_ERR" #define MSSSR_BITMASK \ "\20" \ "\040b0\037b1\036b2\035b3\034b4\033b5\032b6\031b7" \ "\030b8\027b9\026b10\025b11\024b12\023L2TAG\022L2DAT\021L3TAG" \ "\020L3DAT\017APE\016DPE\015TEA\014b20\013b21\012b22\011b23" \ "\010b24\007b25\006b26\005b27\004b28\003b29\002b30\001b31" static const char * trapname(u_int vector) { struct powerpc_exception *pe; for (pe = powerpc_exceptions; pe->vector != EXC_LAST; pe++) { if (pe->vector == vector) return (pe->name); } return ("unknown"); } static inline bool frame_is_trap_inst(struct trapframe *frame) { #ifdef AIM return (frame->exc == EXC_PGM && frame->srr1 & EXC_PGM_TRAP); #else return ((frame->cpu.booke.esr & ESR_PTR) != 0); #endif } void trap(struct trapframe *frame) { struct thread *td; struct proc *p; #ifdef KDTRACE_HOOKS uint32_t inst; #endif int sig, type, user; u_int ucode; ksiginfo_t ksi; register_t addr, fscr; VM_CNT_INC(v_trap); #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif td = curthread; p = td->td_proc; type = ucode = frame->exc; sig = 0; user = frame->srr1 & PSL_PR; addr = 0; CTR3(KTR_TRAP, "trap: %s type=%s (%s)", td->td_name, trapname(type), user ? "user" : "kernel"); #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * If the DTrace kernel module has registered a trap handler, * call it and if it returns non-zero, assume that it has * handled the trap and modified the trap frame so that this * function can return normally. */ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type) != 0) return; #endif if (user) { td->td_pticks = 0; td->td_frame = frame; addr = frame->srr0; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); /* User Mode Traps */ switch (type) { case EXC_RUNMODETRC: case EXC_TRC: frame->srr1 &= ~PSL_SE; sig = SIGTRAP; ucode = TRAP_TRACE; break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: addr = frame->dar; /* FALLTHROUGH */ case EXC_ISE: /* DSE/ISE are automatically fatal with radix pmap. */ if (radix_mmu || handle_user_slb_spill(&p->p_vmspace->vm_pmap, addr) != 0){ sig = SIGSEGV; ucode = SEGV_MAPERR; } break; #endif case EXC_DSI: addr = frame->dar; /* FALLTHROUGH */ case EXC_ISI: if (trap_pfault(frame, true, &sig, &ucode)) sig = 0; break; case EXC_SC: syscall(frame); break; case EXC_FPU: KASSERT((td->td_pcb->pcb_flags & PCB_FPU) != PCB_FPU, ("FPU already enabled for thread")); enable_fpu(td); break; case EXC_VEC: KASSERT((td->td_pcb->pcb_flags & PCB_VEC) != PCB_VEC, ("Altivec already enabled for thread")); enable_vec(td); break; case EXC_VSX: KASSERT((td->td_pcb->pcb_flags & PCB_VSX) != PCB_VSX, ("VSX already enabled for thread")); if (!(td->td_pcb->pcb_flags & PCB_VEC)) enable_vec(td); if (td->td_pcb->pcb_flags & PCB_FPU) save_fpu(td); td->td_pcb->pcb_flags |= PCB_VSX; enable_fpu(td); break; case EXC_FAC: fscr = mfspr(SPR_FSCR); switch (fscr & FSCR_IC_MASK) { case FSCR_IC_HTM: CTR0(KTR_TRAP, "Hardware Transactional Memory subsystem disabled"); sig = SIGILL; ucode = ILL_ILLOPC; break; case FSCR_IC_DSCR: td->td_pcb->pcb_flags |= PCB_CFSCR | PCB_CDSCR; fscr |= FSCR_DSCR; mtspr(SPR_DSCR, 0); break; case FSCR_IC_EBB: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_EBB; mtspr(SPR_EBBHR, 0); mtspr(SPR_EBBRR, 0); mtspr(SPR_BESCR, 0); break; case FSCR_IC_TAR: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_TAR; mtspr(SPR_TAR, 0); break; case FSCR_IC_LM: td->td_pcb->pcb_flags |= PCB_CFSCR; fscr |= FSCR_LM; mtspr(SPR_LMRR, 0); mtspr(SPR_LMSER, 0); break; default: sig = SIGILL; ucode = ILL_ILLOPC; } mtspr(SPR_FSCR, fscr & ~FSCR_IC_MASK); break; case EXC_HEA: sig = SIGILL; ucode = ILL_ILLOPC; break; case EXC_VECAST_E: case EXC_VECAST_G4: case EXC_VECAST_G5: /* * We get a VPU assist exception for IEEE mode * vector operations on denormalized floats. * Emulating this is a giant pain, so for now, * just switch off IEEE mode and treat them as * zero. */ save_vec(td); td->td_pcb->pcb_vec.vscr |= ALTIVEC_VSCR_NJ; enable_vec(td); break; case EXC_ALI: if (fix_unaligned(td, frame) != 0) { sig = SIGBUS; ucode = BUS_ADRALN; addr = frame->dar; } else frame->srr0 += 4; break; case EXC_DEBUG: /* Single stepping */ mtspr(SPR_DBSR, mfspr(SPR_DBSR)); frame->srr1 &= ~PSL_DE; frame->cpu.booke.dbcr0 &= ~(DBCR0_IDM | DBCR0_IC); sig = SIGTRAP; ucode = TRAP_TRACE; break; case EXC_PGM: /* Identify the trap reason */ if (frame_is_trap_inst(frame)) { #ifdef KDTRACE_HOOKS inst = fuword32((const void *)frame->srr0); if (inst == 0x0FFFDDDD && dtrace_pid_probe_ptr != NULL) { (*dtrace_pid_probe_ptr)(frame); break; } #endif sig = SIGTRAP; ucode = TRAP_BRKPT; break; } if ((frame->srr1 & EXC_PGM_FPENABLED) && (td->td_pcb->pcb_flags & PCB_FPU)) sig = SIGFPE; else sig = ppc_instr_emulate(frame, td); if (sig == SIGILL) { if (frame->srr1 & EXC_PGM_PRIV) ucode = ILL_PRVOPC; else if (frame->srr1 & EXC_PGM_ILLEGAL) ucode = ILL_ILLOPC; } else if (sig == SIGFPE) { ucode = get_fpu_exception(td); } break; case EXC_MCHK: sig = cpu_machine_check(td, frame, &ucode); printtrap(frame->exc, frame, 0, (frame->srr1 & PSL_PR)); break; #if defined(__powerpc64__) && defined(AIM) case EXC_SOFT_PATCH: /* * Point to the instruction that generated the exception to execute it again, * and normalize the register values. */ frame->srr0 -= 4; normalize_inputs(); break; #endif default: trap_fatal(frame); } } else { /* Kernel Mode Traps */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case EXC_PGM: #ifdef KDTRACE_HOOKS if (frame_is_trap_inst(frame)) { if (*(uint32_t *)frame->srr0 == EXC_DTRACE) { if (dtrace_invop_jump_addr != NULL) { dtrace_invop_jump_addr(frame); return; } } } #endif #ifdef KDB if (db_trap_glue(frame)) return; #endif break; #if defined(__powerpc64__) && defined(AIM) case EXC_DSE: /* DSE on radix mmu is automatically fatal. */ if (radix_mmu) break; if (td->td_pcb->pcb_cpu.aim.usr_vsid != 0 && (frame->dar & SEGMENT_MASK) == USER_ADDR) { __asm __volatile ("slbmte %0, %1" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); return; } break; #endif case EXC_DSI: if (trap_pfault(frame, false, NULL, NULL)) return; break; case EXC_MCHK: if (handle_onfault(frame)) return; break; default: break; } trap_fatal(frame); } if (sig != 0) { if (p->p_sysent->sv_transtrap != NULL) sig = (p->p_sysent->sv_transtrap)(sig, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = (int) ucode; /* XXX, not POSIX */ ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d code %d type 0x%x " "addr 0x%lx r1 0x%lx srr0 0x%lx srr1 0x%lx\n", p->p_pid, p->p_comm, sig, ucode, type, (u_long)addr, (u_long)frame->fixreg[1], (u_long)frame->srr0, (u_long)frame->srr1); } trapsignal(td, &ksi); } userret(td, frame); } static void trap_fatal(struct trapframe *frame) { #ifdef KDB bool handled; #endif printtrap(frame->exc, frame, 1, (frame->srr1 & PSL_PR)); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(frame->exc, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("%s trap", trapname(frame->exc)); } static void cpu_printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { #ifdef AIM uint16_t ver; switch (vector) { case EXC_MCHK: ver = mfpvr() >> 16; if (MPC745X_P(ver)) printf(" msssr0 = 0x%b\n", (int)mfspr(SPR_MSSSR0), MSSSR_BITMASK); case EXC_DSE: case EXC_DSI: case EXC_DTMISS: printf(" dsisr = 0x%lx\n", (u_long)frame->cpu.aim.dsisr); break; } #elif defined(BOOKE) vm_paddr_t pa; switch (vector) { case EXC_MCHK: pa = mfspr(SPR_MCARU); pa = (pa << 32) | (u_register_t)mfspr(SPR_MCAR); printf(" mcsr = 0x%b\n", (int)mfspr(SPR_MCSR), MCSR_BITMASK); printf(" mcar = 0x%jx\n", (uintmax_t)pa); } printf(" esr = 0x%b\n", (int)frame->cpu.booke.esr, ESR_BITMASK); #endif } static void printtrap(u_int vector, struct trapframe *frame, int isfatal, int user) { printf("\n"); printf("%s %s trap:\n", isfatal ? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" exception = 0x%x (%s)\n", vector, trapname(vector)); switch (vector) { case EXC_DSE: case EXC_DSI: case EXC_DTMISS: case EXC_ALI: case EXC_MCHK: printf(" virtual address = 0x%" PRIxPTR "\n", frame->dar); break; case EXC_ISE: case EXC_ISI: case EXC_ITMISS: printf(" virtual address = 0x%" PRIxPTR "\n", frame->srr0); break; } cpu_printtrap(vector, frame, isfatal, user); printf(" srr0 = 0x%" PRIxPTR " (0x%" PRIxPTR ")\n", frame->srr0, frame->srr0 - (register_t)(__startkernel - KERNBASE)); printf(" srr1 = 0x%lx\n", (u_long)frame->srr1); printf(" current msr = 0x%" PRIxPTR "\n", mfmsr()); printf(" lr = 0x%" PRIxPTR " (0x%" PRIxPTR ")\n", frame->lr, frame->lr - (register_t)(__startkernel - KERNBASE)); printf(" frame = %p\n", frame); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_name); printf("\n"); } /* * Handles a fatal fault when we have onfault state to recover. Returns * non-zero if there was onfault recovery state available. */ static int handle_onfault(struct trapframe *frame) { struct thread *td; jmp_buf *fb; td = curthread; #if defined(__powerpc64__) || defined(BOOKE) uintptr_t dispatch = (uintptr_t)td->td_pcb->pcb_onfault; if (dispatch == 0) return (0); /* Short-circuit radix and Book-E paths. */ switch (dispatch) { case COPYFAULT: frame->srr0 = (uintptr_t)copy_fault; return (1); case FUSUFAULT: frame->srr0 = (uintptr_t)fusufault; return (1); default: break; } #endif fb = td->td_pcb->pcb_onfault; if (fb != NULL) { frame->srr0 = (*fb)->_jb[FAULTBUF_LR]; frame->fixreg[1] = (*fb)->_jb[FAULTBUF_R1]; frame->fixreg[2] = (*fb)->_jb[FAULTBUF_R2]; frame->fixreg[3] = 1; frame->cr = (*fb)->_jb[FAULTBUF_CR]; bcopy(&(*fb)->_jb[FAULTBUF_R14], &frame->fixreg[14], 18 * sizeof(register_t)); td->td_pcb->pcb_onfault = NULL; /* Returns twice, not thrice */ return (1); } return (0); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; caddr_t params; size_t argsz; int error, n, narg, i; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->fixreg[0]; + sa->original_code = sa->code; params = (caddr_t)(frame->fixreg + FIRSTARG); n = NARGREG; if (sa->code == SYS_syscall) { /* * code is first argument, * followed by actual args. */ sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, * so as to maintain quad alignment * for the rest of the args. */ if (SV_PROC_FLAG(p, SV_ILP32)) { params += sizeof(register_t); sa->code = *(register_t *) params; params += sizeof(register_t); n -= 2; } else { sa->code = *(register_t *) params; params += sizeof(register_t); n -= 1; } } if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; narg = sa->callp->sy_narg; if (SV_PROC_FLAG(p, SV_ILP32)) { argsz = sizeof(uint32_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i] & 0xffffffff; } else { argsz = sizeof(uint64_t); for (i = 0; i < n; i++) sa->args[i] = ((u_register_t *)(params))[i]; } if (narg > n) error = copyin(MOREARGS(frame->fixreg[1]), sa->args + n, (narg - n) * argsz); else error = 0; #ifdef __powerpc64__ if (SV_PROC_FLAG(p, SV_ILP32) && narg > n) { /* Expand the size of arguments copied from the stack */ for (i = narg; i >= n; i--) sa->args[i] = ((uint32_t *)(&sa->args[n]))[i-n]; } #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->fixreg[FIRSTARG + 1]; } return (error); } #include "../../kern/subr_syscall.c" void syscall(struct trapframe *frame) { struct thread *td; td = curthread; td->td_frame = frame; #if defined(__powerpc64__) && defined(AIM) /* * Speculatively restore last user SLB segment, which we know is * invalid already, since we are likely to do copyin()/copyout(). */ if (td->td_pcb->pcb_cpu.aim.usr_vsid != 0) __asm __volatile ("slbmte %0, %1; isync" :: "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE)); #endif syscallenter(td); syscallret(td); } static bool trap_pfault(struct trapframe *frame, bool user, int *signo, int *ucode) { vm_offset_t eva; struct thread *td; struct proc *p; vm_map_t map; vm_prot_t ftype; int rv, is_user; td = curthread; p = td->td_proc; if (frame->exc == EXC_ISI) { eva = frame->srr0; ftype = VM_PROT_EXECUTE; if (frame->srr1 & SRR1_ISI_PFAULT) ftype |= VM_PROT_READ; } else { eva = frame->dar; #ifdef BOOKE if (frame->cpu.booke.esr & ESR_ST) #else if (frame->cpu.aim.dsisr & DSISR_STORE) #endif ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; } #if defined(__powerpc64__) && defined(AIM) if (radix_mmu && pmap_nofault(&p->p_vmspace->vm_pmap, eva, ftype) == 0) return (true); #endif if (__predict_false((td->td_pflags & TDP_NOFAULTING) == 0)) { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame); return (false); } } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; } else { rv = pmap_decode_kernel_ptr(eva, &is_user, &eva); if (rv != 0) return (false); if (is_user) map = &p->p_vmspace->vm_map; else map = kernel_map; } /* Fault in the page. */ rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode); /* * XXXDTRACE: add dtrace_doubletrap_func here? */ if (rv == KERN_SUCCESS) return (true); if (!user && handle_onfault(frame)) return (true); return (false); } /* * For now, this only deals with the particular unaligned access case * that gcc tends to generate. Eventually it should handle all of the * possibilities that can happen on a 32-bit PowerPC in big-endian mode. */ static int fix_unaligned(struct thread *td, struct trapframe *frame) { struct thread *fputhread; #ifdef BOOKE uint32_t inst; #endif int indicator, reg; double *fpr; #ifdef __SPE__ indicator = (frame->cpu.booke.esr & (ESR_ST|ESR_SPE)); if (indicator & ESR_SPE) { if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_INST_RST(inst); fpr = (double *)td->td_pcb->pcb_vec.vr[reg]; fputhread = PCPU_GET(vecthread); /* Juggle the SPE to ensure that we've initialized * the registers, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_vec(fputhread); enable_vec(td); } save_vec(td); if (!(indicator & ESR_ST)) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); frame->fixreg[reg] = td->td_pcb->pcb_vec.vr[reg][1]; enable_vec(td); } else { td->td_pcb->pcb_vec.vr[reg][1] = frame->fixreg[reg]; if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); } #else #ifdef BOOKE indicator = (frame->cpu.booke.esr & ESR_ST) ? EXC_ALI_STFD : EXC_ALI_LFD; #else indicator = EXC_ALI_OPCODE_INDICATOR(frame->cpu.aim.dsisr); #endif switch (indicator) { case EXC_ALI_LFD: case EXC_ALI_STFD: #ifdef BOOKE if (copyin((void *)frame->srr0, &inst, sizeof(inst)) != 0) return (-1); reg = EXC_ALI_INST_RST(inst); #else reg = EXC_ALI_RST(frame->cpu.aim.dsisr); #endif fpr = &td->td_pcb->pcb_fpu.fpr[reg].fpr; fputhread = PCPU_GET(fputhread); /* Juggle the FPU to ensure that we've initialized * the FPRs, and that their current state is in * the PCB. */ if (fputhread != td) { if (fputhread) save_fpu(fputhread); enable_fpu(td); } save_fpu(td); if (indicator == EXC_ALI_LFD) { if (copyin((void *)frame->dar, fpr, sizeof(double)) != 0) return (-1); enable_fpu(td); } else { if (copyout(fpr, (void *)frame->dar, sizeof(double)) != 0) return (-1); } return (0); break; } #endif return (-1); } #if defined(__powerpc64__) && defined(AIM) #define MSKNSHL(x, m, n) "(((" #x ") & " #m ") << " #n ")" #define MSKNSHR(x, m, n) "(((" #x ") & " #m ") >> " #n ")" /* xvcpsgndp instruction, built in opcode format. * This can be changed to use mnemonic after a toolchain update. */ #define XVCPSGNDP(xt, xa, xb) \ __asm __volatile(".long (" \ MSKNSHL(60, 0x3f, 26) " | " \ MSKNSHL(xt, 0x1f, 21) " | " \ MSKNSHL(xa, 0x1f, 16) " | " \ MSKNSHL(xb, 0x1f, 11) " | " \ MSKNSHL(240, 0xff, 3) " | " \ MSKNSHR(xa, 0x20, 3) " | " \ MSKNSHR(xa, 0x20, 4) " | " \ MSKNSHR(xa, 0x20, 5) ")") /* Macros to normalize 1 or 10 VSX registers */ #define NORM(x) XVCPSGNDP(x, x, x) #define NORM10(x) \ NORM(x ## 0); NORM(x ## 1); NORM(x ## 2); NORM(x ## 3); NORM(x ## 4); \ NORM(x ## 5); NORM(x ## 6); NORM(x ## 7); NORM(x ## 8); NORM(x ## 9) static void normalize_inputs(void) { register_t msr; /* enable VSX */ msr = mfmsr(); mtmsr(msr | PSL_VSX); NORM(0); NORM(1); NORM(2); NORM(3); NORM(4); NORM(5); NORM(6); NORM(7); NORM(8); NORM(9); NORM10(1); NORM10(2); NORM10(3); NORM10(4); NORM10(5); NORM(60); NORM(61); NORM(62); NORM(63); /* restore MSR */ mtmsr(msr); } #endif #ifdef KDB int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || frame_is_trap_inst(frame) || frame->exc == EXC_BPT || frame->exc == EXC_DEBUG || frame->exc == EXC_DSI)) { int type = frame->exc; /* Ignore DTrace traps. */ if (*(uint32_t *)frame->srr0 == EXC_DTRACE) return (0); if (frame_is_trap_inst(frame)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } #endif diff --git a/sys/riscv/include/proc.h b/sys/riscv/include/proc.h index 4b5ae9ebe3ed..1c6c8d2919b5 100644 --- a/sys/riscv/include/proc.h +++ b/sys/riscv/include/proc.h @@ -1,55 +1,56 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * from: FreeBSD: src/sys/i386/include/proc.h,v 1.11 2001/06/29 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_sstatus_ie; /* (k) */ }; struct mdproc { int dummy; }; #define KINFO_PROC_SIZE 1088 #define MAXARGS 8 struct syscall_args { u_int code; + u_int original_code; struct sysent *callp; register_t args[MAXARGS]; }; #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/riscv/riscv/trap.c b/sys/riscv/riscv/trap.c index 07d7f84a94e8..8844638c8204 100644 --- a/sys/riscv/riscv/trap.c +++ b/sys/riscv/riscv/trap.c @@ -1,405 +1,406 @@ /*- * Copyright (c) 2015-2018 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #ifdef FPE #include #endif #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif int (*dtrace_invop_jump_addr)(struct trapframe *); /* Called from exception.S */ void do_trap_supervisor(struct trapframe *); void do_trap_user(struct trapframe *); static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr, int trapno) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; ksi.ksi_trapno = trapno; trapsignal(td, &ksi); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; register_t *ap, *dst_ap; struct syscall_args *sa; p = td->td_proc; sa = &td->td_sa; ap = &td->td_frame->tf_a[0]; dst_ap = &sa->args[0]; sa->code = td->td_frame->tf_t[0]; + sa->original_code = sa->code; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall)) { sa->code = *ap++; } else { *dst_ap++ = *ap++; } if (__predict_false(sa->code >= p->p_sysent->sv_size)) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Syscall %d takes too many arguments", sa->code)); memcpy(dst_ap, ap, (NARGREG - 1) * sizeof(register_t)); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" static void dump_regs(struct trapframe *frame) { int n; int i; n = nitems(frame->tf_t); for (i = 0; i < n; i++) printf("t[%d] == 0x%016lx\n", i, frame->tf_t[i]); n = nitems(frame->tf_s); for (i = 0; i < n; i++) printf("s[%d] == 0x%016lx\n", i, frame->tf_s[i]); n = nitems(frame->tf_a); for (i = 0; i < n; i++) printf("a[%d] == 0x%016lx\n", i, frame->tf_a[i]); printf("ra == 0x%016lx\n", frame->tf_ra); printf("sp == 0x%016lx\n", frame->tf_sp); printf("gp == 0x%016lx\n", frame->tf_gp); printf("tp == 0x%016lx\n", frame->tf_tp); printf("sepc == 0x%016lx\n", frame->tf_sepc); printf("sstatus == 0x%016lx\n", frame->tf_sstatus); } static void ecall_handler(void) { struct thread *td; td = curthread; syscallenter(td); syscallret(td); } static void page_fault_handler(struct trapframe *frame, int usermode) { struct vm_map *map; uint64_t stval; struct thread *td; struct pcb *pcb; vm_prot_t ftype; vm_offset_t va; struct proc *p; int error, sig, ucode; #ifdef KDB bool handled; #endif #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif td = curthread; p = td->td_proc; pcb = td->td_pcb; stval = frame->tf_stval; if (td->td_critnest != 0 || td->td_intr_nesting_level != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) goto fatal; if (usermode) { map = &td->td_proc->p_vmspace->vm_map; } else { /* * Enable interrupts for the duration of the page fault. For * user faults this was done already in do_trap_user(). */ intr_enable(); if (stval >= VM_MAX_USER_ADDRESS) { map = kernel_map; } else { if (pcb->pcb_onfault == 0) goto fatal; map = &td->td_proc->p_vmspace->vm_map; } } va = trunc_page(stval); if (frame->tf_scause == SCAUSE_STORE_PAGE_FAULT) { ftype = VM_PROT_WRITE; } else if (frame->tf_scause == SCAUSE_INST_PAGE_FAULT) { ftype = VM_PROT_EXECUTE; } else { ftype = VM_PROT_READ; } if (pmap_fault(map->pmap, va, ftype)) goto done; error = vm_fault_trap(map, va, ftype, VM_FAULT_NORMAL, &sig, &ucode); if (error != KERN_SUCCESS) { if (usermode) { call_trapsignal(td, sig, ucode, (void *)stval, frame->tf_scause & SCAUSE_CODE); } else { if (pcb->pcb_onfault != 0) { frame->tf_a[0] = error; frame->tf_sepc = pcb->pcb_onfault; return; } goto fatal; } } done: if (usermode) userret(td, frame); return; fatal: dump_regs(frame); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(frame->tf_scause & SCAUSE_CODE, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("Fatal page fault at %#lx: %#016lx", frame->tf_sepc, stval); } void do_trap_supervisor(struct trapframe *frame) { uint64_t exception; /* Ensure we came from supervisor mode, interrupts disabled */ KASSERT((csr_read(sstatus) & (SSTATUS_SPP | SSTATUS_SIE)) == SSTATUS_SPP, ("Came from S mode with interrupts enabled")); KASSERT((csr_read(sstatus) & (SSTATUS_SUM)) == 0, ("Came from S mode with SUM enabled")); exception = frame->tf_scause & SCAUSE_CODE; if ((frame->tf_scause & SCAUSE_INTR) != 0) { /* Interrupt */ riscv_cpu_intr(frame); return; } #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR3(KTR_TRAP, "do_trap_supervisor: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch (exception) { case SCAUSE_LOAD_ACCESS_FAULT: case SCAUSE_STORE_ACCESS_FAULT: case SCAUSE_INST_ACCESS_FAULT: dump_regs(frame); panic("Memory access exception at 0x%016lx\n", frame->tf_sepc); break; case SCAUSE_STORE_PAGE_FAULT: case SCAUSE_LOAD_PAGE_FAULT: case SCAUSE_INST_PAGE_FAULT: page_fault_handler(frame, 0); break; case SCAUSE_BREAKPOINT: #ifdef KDTRACE_HOOKS if (dtrace_invop_jump_addr != NULL && dtrace_invop_jump_addr(frame) == 0) break; #endif #ifdef KDB kdb_trap(exception, 0, frame); #else dump_regs(frame); panic("No debugger in kernel.\n"); #endif break; case SCAUSE_ILLEGAL_INSTRUCTION: dump_regs(frame); panic("Illegal instruction at 0x%016lx\n", frame->tf_sepc); break; default: dump_regs(frame); panic("Unknown kernel exception %lx trap value %lx\n", exception, frame->tf_stval); } } void do_trap_user(struct trapframe *frame) { uint64_t exception; struct thread *td; struct pcb *pcb; td = curthread; pcb = td->td_pcb; KASSERT(td->td_frame == frame, ("%s: td_frame %p != frame %p", __func__, td->td_frame, frame)); /* Ensure we came from usermode, interrupts disabled */ KASSERT((csr_read(sstatus) & (SSTATUS_SPP | SSTATUS_SIE)) == 0, ("Came from U mode with interrupts enabled")); KASSERT((csr_read(sstatus) & (SSTATUS_SUM)) == 0, ("Came from U mode with SUM enabled")); exception = frame->tf_scause & SCAUSE_CODE; if ((frame->tf_scause & SCAUSE_INTR) != 0) { /* Interrupt */ riscv_cpu_intr(frame); return; } intr_enable(); CTR3(KTR_TRAP, "do_trap_user: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch (exception) { case SCAUSE_LOAD_ACCESS_FAULT: case SCAUSE_STORE_ACCESS_FAULT: case SCAUSE_INST_ACCESS_FAULT: call_trapsignal(td, SIGBUS, BUS_ADRERR, (void *)frame->tf_sepc, exception); userret(td, frame); break; case SCAUSE_STORE_PAGE_FAULT: case SCAUSE_LOAD_PAGE_FAULT: case SCAUSE_INST_PAGE_FAULT: page_fault_handler(frame, 1); break; case SCAUSE_ECALL_USER: frame->tf_sepc += 4; /* Next instruction */ ecall_handler(); break; case SCAUSE_ILLEGAL_INSTRUCTION: #ifdef FPE if ((pcb->pcb_fpflags & PCB_FP_STARTED) == 0) { /* * May be a FPE trap. Enable FPE usage * for this thread and try again. */ fpe_state_clear(); frame->tf_sstatus &= ~SSTATUS_FS_MASK; frame->tf_sstatus |= SSTATUS_FS_CLEAN; pcb->pcb_fpflags |= PCB_FP_STARTED; break; } #endif call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)frame->tf_sepc, exception); userret(td, frame); break; case SCAUSE_BREAKPOINT: call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_sepc, exception); userret(td, frame); break; default: dump_regs(frame); panic("Unknown userland exception %lx, trap value %lx\n", exception, frame->tf_stval); } } diff --git a/sys/sys/signal.h b/sys/sys/signal.h index 8b45a521c3ee..9dae3ce04745 100644 --- a/sys/sys/signal.h +++ b/sys/sys/signal.h @@ -1,514 +1,521 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)signal.h 8.4 (Berkeley) 5/4/95 * $FreeBSD$ */ #ifndef _SYS_SIGNAL_H_ #define _SYS_SIGNAL_H_ #include #include #include #include /* __MINSIGSTKSZ */ #include /* sig_atomic_t; trap codes; sigcontext */ #if __POSIX_VISIBLE >= 200809 #include #include #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif /* __POSIX_VISIBLE >= 200809 */ /* * System defined signals. */ #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGHUP 1 /* hangup */ #endif #define SIGINT 2 /* interrupt */ #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGQUIT 3 /* quit */ #endif #define SIGILL 4 /* illegal instr. (not reset when caught) */ #if __XSI_VISIBLE #define SIGTRAP 5 /* trace trap (not reset when caught) */ #endif #define SIGABRT 6 /* abort() */ #if __BSD_VISIBLE #define SIGIOT SIGABRT /* compatibility */ #define SIGEMT 7 /* EMT instruction */ #endif #define SIGFPE 8 /* floating point exception */ #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGKILL 9 /* kill (cannot be caught or ignored) */ #endif #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE #define SIGBUS 10 /* bus error */ #endif #define SIGSEGV 11 /* segmentation violation */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE #define SIGSYS 12 /* non-existent system call invoked */ #endif #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGPIPE 13 /* write on a pipe with no one to read it */ #define SIGALRM 14 /* alarm clock */ #endif #define SIGTERM 15 /* software termination signal from kill */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE #define SIGURG 16 /* urgent condition on IO channel */ #endif #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGSTOP 17 /* sendable stop signal not from tty */ #define SIGTSTP 18 /* stop signal from tty */ #define SIGCONT 19 /* continue a stopped process */ #define SIGCHLD 20 /* to parent on child stop or exit */ #define SIGTTIN 21 /* to readers pgrp upon background tty read */ #define SIGTTOU 22 /* like TTIN if (tp->t_local<OSTOP) */ #endif #if __BSD_VISIBLE #define SIGIO 23 /* input/output possible signal */ #endif #if __XSI_VISIBLE #define SIGXCPU 24 /* exceeded CPU time limit */ #define SIGXFSZ 25 /* exceeded file size limit */ #define SIGVTALRM 26 /* virtual time alarm */ #define SIGPROF 27 /* profiling time alarm */ #endif #if __BSD_VISIBLE #define SIGWINCH 28 /* window size changes */ #define SIGINFO 29 /* information request */ #endif #if __POSIX_VISIBLE || __XSI_VISIBLE #define SIGUSR1 30 /* user defined signal 1 */ #define SIGUSR2 31 /* user defined signal 2 */ #endif #if __BSD_VISIBLE #define SIGTHR 32 /* reserved by thread library. */ #define SIGLWP SIGTHR #define SIGLIBRT 33 /* reserved by real-time library. */ #endif #define SIGRTMIN 65 #define SIGRTMAX 126 #define SIG_DFL ((__sighandler_t *)0) #define SIG_IGN ((__sighandler_t *)1) #define SIG_ERR ((__sighandler_t *)-1) /* #define SIG_CATCH ((__sighandler_t *)2) See signalvar.h */ #define SIG_HOLD ((__sighandler_t *)3) /* * Type of a signal handling function. * * Language spec sez signal handlers take exactly one arg, even though we * actually supply three. Ugh! * * We don't try to hide the difference by leaving out the args because * that would cause warnings about conformant programs. Nonconformant * programs can avoid the warnings by casting to (__sighandler_t *) or * sig_t before calling signal() or assigning to sa_handler or sv_handler. * * The kernel should reverse the cast before calling the function. It * has no way to do this, but on most machines 1-arg and 3-arg functions * have the same calling protocol so there is no problem in practice. * A bit in sa_flags could be used to specify the number of args. */ typedef void __sighandler_t(int); #if __POSIX_VISIBLE || __XSI_VISIBLE #ifndef _SIGSET_T_DECLARED #define _SIGSET_T_DECLARED typedef __sigset_t sigset_t; #endif #endif #if __POSIX_VISIBLE >= 199309 || __XSI_VISIBLE >= 500 union sigval { /* Members as suggested by Annex C of POSIX 1003.1b. */ int sival_int; void *sival_ptr; /* 6.0 compatibility */ int sigval_int; void *sigval_ptr; }; #if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__)) union sigval32 { int sival_int; uint32_t sival_ptr; /* 6.0 compatibility */ int sigval_int; uint32_t sigval_ptr; }; #endif #endif #if __POSIX_VISIBLE >= 199309 struct pthread_attr; struct sigevent { int sigev_notify; /* Notification type */ int sigev_signo; /* Signal number */ union sigval sigev_value; /* Signal value */ union { __lwpid_t _threadid; struct { void (*_function)(union sigval); struct pthread_attr **_attribute; } _sigev_thread; unsigned short _kevent_flags; long __spare__[8]; } _sigev_un; }; #if __BSD_VISIBLE #define sigev_notify_kqueue sigev_signo #define sigev_notify_kevent_flags _sigev_un._kevent_flags #define sigev_notify_thread_id _sigev_un._threadid #endif #define sigev_notify_function _sigev_un._sigev_thread._function #define sigev_notify_attributes _sigev_un._sigev_thread._attribute #define SIGEV_NONE 0 /* No async notification. */ #define SIGEV_SIGNAL 1 /* Generate a queued signal. */ #define SIGEV_THREAD 2 /* Call back from another pthread. */ #if __BSD_VISIBLE #define SIGEV_KEVENT 3 /* Generate a kevent. */ #define SIGEV_THREAD_ID 4 /* Send signal to a kernel thread. */ #endif #endif /* __POSIX_VISIBLE >= 199309 */ #if __POSIX_VISIBLE >= 199309 || __XSI_VISIBLE typedef struct __siginfo { int si_signo; /* signal number */ int si_errno; /* errno association */ /* * Cause of signal, one of the SI_ macros or signal-specific * values, i.e. one of the FPE_... values for SIGFPE. This * value is equivalent to the second argument to an old-style * FreeBSD signal handler. */ int si_code; /* signal code */ __pid_t si_pid; /* sending process */ __uid_t si_uid; /* sender's ruid */ int si_status; /* exit value */ void *si_addr; /* faulting instruction */ union sigval si_value; /* signal value */ union { struct { int _trapno;/* machine specific trap code */ } _fault; struct { int _timerid; int _overrun; } _timer; struct { int _mqd; } _mesgq; struct { long _band; /* band event for SIGPOLL */ } _poll; /* was this ever used ? */ + struct { + int _syscall; /* Syscall number for signals + * delivered as a result of + * system calls denied by + * Capsicum. */ + } _capsicum; struct { long __spare1__; int __spare2__[7]; } __spare__; } _reason; } siginfo_t; #define si_trapno _reason._fault._trapno #define si_timerid _reason._timer._timerid #define si_overrun _reason._timer._overrun #define si_mqd _reason._mesgq._mqd #define si_band _reason._poll._band +#define si_syscall _reason._capsicum._syscall #if defined(_WANT_LWPINFO32) || (defined(_KERNEL) && defined(__LP64__)) struct siginfo32 { int si_signo; /* signal number */ int si_errno; /* errno association */ int si_code; /* signal code */ __pid_t si_pid; /* sending process */ __uid_t si_uid; /* sender's ruid */ int si_status; /* exit value */ uint32_t si_addr; /* faulting instruction */ union sigval32 si_value; /* signal value */ union { struct { int _trapno;/* machine specific trap code */ } _fault; struct { int _timerid; int _overrun; } _timer; struct { int _mqd; } _mesgq; struct { int32_t _band; /* band event for SIGPOLL */ } _poll; /* was this ever used ? */ struct { int32_t __spare1__; int __spare2__[7]; } __spare__; } _reason; }; #endif /** si_code **/ /* codes for SIGILL */ #define ILL_ILLOPC 1 /* Illegal opcode. */ #define ILL_ILLOPN 2 /* Illegal operand. */ #define ILL_ILLADR 3 /* Illegal addressing mode. */ #define ILL_ILLTRP 4 /* Illegal trap. */ #define ILL_PRVOPC 5 /* Privileged opcode. */ #define ILL_PRVREG 6 /* Privileged register. */ #define ILL_COPROC 7 /* Coprocessor error. */ #define ILL_BADSTK 8 /* Internal stack error. */ /* codes for SIGBUS */ #define BUS_ADRALN 1 /* Invalid address alignment. */ #define BUS_ADRERR 2 /* Nonexistent physical address. */ #define BUS_OBJERR 3 /* Object-specific hardware error. */ #define BUS_OOMERR 100 /* Non-standard: No memory. */ /* codes for SIGSEGV */ #define SEGV_MAPERR 1 /* Address not mapped to object. */ #define SEGV_ACCERR 2 /* Invalid permissions for mapped */ /* object. */ #define SEGV_PKUERR 100 /* x86: PKU violation */ /* codes for SIGFPE */ #define FPE_INTOVF 1 /* Integer overflow. */ #define FPE_INTDIV 2 /* Integer divide by zero. */ #define FPE_FLTDIV 3 /* Floating point divide by zero. */ #define FPE_FLTOVF 4 /* Floating point overflow. */ #define FPE_FLTUND 5 /* Floating point underflow. */ #define FPE_FLTRES 6 /* Floating point inexact result. */ #define FPE_FLTINV 7 /* Invalid floating point operation. */ #define FPE_FLTSUB 8 /* Subscript out of range. */ /* codes for SIGTRAP */ #define TRAP_BRKPT 1 /* Process breakpoint. */ #define TRAP_TRACE 2 /* Process trace trap. */ #define TRAP_DTRACE 3 /* DTrace induced trap. */ #define TRAP_CAP 4 /* Capabilities protective trap. */ /* codes for SIGCHLD */ #define CLD_EXITED 1 /* Child has exited */ #define CLD_KILLED 2 /* Child has terminated abnormally but */ /* did not create a core file */ #define CLD_DUMPED 3 /* Child has terminated abnormally and */ /* created a core file */ #define CLD_TRAPPED 4 /* Traced child has trapped */ #define CLD_STOPPED 5 /* Child has stopped */ #define CLD_CONTINUED 6 /* Stopped child has continued */ /* codes for SIGPOLL */ #define POLL_IN 1 /* Data input available */ #define POLL_OUT 2 /* Output buffers available */ #define POLL_MSG 3 /* Input message available */ #define POLL_ERR 4 /* I/O Error */ #define POLL_PRI 5 /* High priority input available */ #define POLL_HUP 6 /* Device disconnected */ #endif #if __POSIX_VISIBLE || __XSI_VISIBLE struct __siginfo; /* * Signal vector "template" used in sigaction call. */ struct sigaction { union { void (*__sa_handler)(int); void (*__sa_sigaction)(int, struct __siginfo *, void *); } __sigaction_u; /* signal handler */ int sa_flags; /* see signal options below */ sigset_t sa_mask; /* signal mask to apply */ }; #define sa_handler __sigaction_u.__sa_handler #endif #if __XSI_VISIBLE /* If SA_SIGINFO is set, sa_sigaction must be used instead of sa_handler. */ #define sa_sigaction __sigaction_u.__sa_sigaction #endif #if __POSIX_VISIBLE || __XSI_VISIBLE #define SA_NOCLDSTOP 0x0008 /* do not generate SIGCHLD on child stop */ #endif /* __POSIX_VISIBLE || __XSI_VISIBLE */ #if __XSI_VISIBLE #define SA_ONSTACK 0x0001 /* take signal on signal stack */ #define SA_RESTART 0x0002 /* restart system call on signal return */ #define SA_RESETHAND 0x0004 /* reset to SIG_DFL when taking signal */ #define SA_NODEFER 0x0010 /* don't mask the signal we're delivering */ #define SA_NOCLDWAIT 0x0020 /* don't keep zombies around */ #define SA_SIGINFO 0x0040 /* signal handler with SA_SIGINFO args */ #endif #if __BSD_VISIBLE #define NSIG 32 /* number of old signals (counting 0) */ #endif #if __POSIX_VISIBLE || __XSI_VISIBLE #define SI_NOINFO 0 /* No signal info besides si_signo. */ #define SI_USER 0x10001 /* Signal sent by kill(). */ #define SI_QUEUE 0x10002 /* Signal sent by the sigqueue(). */ #define SI_TIMER 0x10003 /* Signal generated by expiration of */ /* a timer set by timer_settime(). */ #define SI_ASYNCIO 0x10004 /* Signal generated by completion of */ /* an asynchronous I/O request.*/ #define SI_MESGQ 0x10005 /* Signal generated by arrival of a */ /* message on an empty message queue. */ #define SI_KERNEL 0x10006 #define SI_LWP 0x10007 /* Signal sent by thr_kill */ #endif #if __BSD_VISIBLE #define SI_UNDEFINED 0 #endif #if __BSD_VISIBLE typedef __sighandler_t *sig_t; /* type of pointer to a signal function */ typedef void __siginfohandler_t(int, struct __siginfo *, void *); #endif #if __XSI_VISIBLE #if __BSD_VISIBLE #define __stack_t sigaltstack #endif typedef struct __stack_t stack_t; #define SS_ONSTACK 0x0001 /* take signal on alternate stack */ #define SS_DISABLE 0x0004 /* disable taking signals on alternate stack */ #define MINSIGSTKSZ __MINSIGSTKSZ /* minimum stack size */ #define SIGSTKSZ (MINSIGSTKSZ + 32768) /* recommended stack size */ #endif /* * Structure used in sigaltstack call. Its definition is always * needed for __ucontext. If __BSD_VISIBLE is defined, the structure * tag is actually sigaltstack. */ struct __stack_t { void *ss_sp; /* signal stack base */ __size_t ss_size; /* signal stack length */ int ss_flags; /* SS_DISABLE and/or SS_ONSTACK */ }; #if __BSD_VISIBLE /* * 4.3 compatibility: * Signal vector "template" used in sigvec call. */ struct sigvec { __sighandler_t *sv_handler; /* signal handler */ int sv_mask; /* signal mask to apply */ int sv_flags; /* see signal options below */ }; #define SV_ONSTACK SA_ONSTACK #define SV_INTERRUPT SA_RESTART /* same bit, opposite sense */ #define SV_RESETHAND SA_RESETHAND #define SV_NODEFER SA_NODEFER #define SV_NOCLDSTOP SA_NOCLDSTOP #define SV_SIGINFO SA_SIGINFO #define sv_onstack sv_flags /* isn't compatibility wonderful! */ #endif /* Keep this in one place only */ #if defined(_KERNEL) && defined(COMPAT_43) && \ !defined(__i386__) struct osigcontext { int _not_used; }; #endif #if __XSI_VISIBLE /* * Structure used in sigstack call. */ struct sigstack { void *ss_sp; /* signal stack pointer */ int ss_onstack; /* current status */ }; #endif #if __BSD_VISIBLE || __POSIX_VISIBLE > 0 && __POSIX_VISIBLE <= 200112 /* * Macro for converting signal number to a mask suitable for * sigblock(). */ #define sigmask(m) (1 << ((m)-1)) #endif #if __BSD_VISIBLE #define BADSIG SIG_ERR #endif #if __POSIX_VISIBLE || __XSI_VISIBLE /* * Flags for sigprocmask: */ #define SIG_BLOCK 1 /* block specified signal set */ #define SIG_UNBLOCK 2 /* unblock specified signal set */ #define SIG_SETMASK 3 /* set specified signal set */ #endif /* * For historical reasons; programs expect signal's return value to be * defined by . */ __BEGIN_DECLS __sighandler_t *signal(int, __sighandler_t *); __END_DECLS #endif /* !_SYS_SIGNAL_H_ */