diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c index 48bda05f9685..168c24cbb65b 100644 --- a/sys/amd64/amd64/exec_machdep.c +++ b/sys/amd64/amd64/exec_machdep.c @@ -1,983 +1,978 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include static void get_fpcontext(struct thread *td, mcontext_t *mcp, - char *xfpusave, size_t xfpusave_len); + char **xfpusave, size_t *xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored at top to call routine, * followed by call to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the frame pointer, it * returns to the user specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); - if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { - xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); - xfpusave = (char *)td->td_md.md_fpu_scratch; - } else { - xfpusave_len = 0; - xfpusave = NULL; - } - /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ - get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); + get_fpcontext(td, &sf.sf_uc.uc_mcontext, &xfpusave, &xfpusave_len); fpstate_drop(td); update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } - xfpustate = __builtin_alloca(xfpustate_len); + xfpustate = (char *)fpu_save_area_alloc(); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { + fpu_save_area_free((struct savefpu *)xfpustate); uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); + fpu_save_area_free((struct savefpu *)xfpustate); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ void x86_clear_dbregs(struct pcb *pcb) { if ((pcb->pcb_flags & PCB_DBREGS) == 0) return; pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running CPU, * otherwise they will end up affecting the next * process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } regs->r_err = 0; regs->r_trapno = 0; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { critical_enter(); set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); critical_exit(); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); - xfpustate = (char *)td->td_md.md_fpu_scratch; + xfpustate = (char *)fpu_save_area_alloc(); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); - if (ret != 0) + if (ret != 0) { + fpu_save_area_free((struct savefpu *)xfpustate); return (ret); + } } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); + fpu_save_area_free((struct savefpu *)xfpustate); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } return (0); } static void -get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, - size_t xfpusave_len) +get_fpcontext(struct thread *td, mcontext_t *mcp, char **xfpusave, + size_t *xfpusave_len) { - size_t max_len, len; - mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); - if (!use_xsave || xfpusave_len == 0) + if (xfpusave == NULL) return; - max_len = cpu_max_ext_state_size - sizeof(struct savefpu); - len = xfpusave_len; - if (len > max_len) { - len = max_len; - bzero(xfpusave + max_len, len - max_len); + if (!use_xsave || cpu_max_ext_state_size <= sizeof(struct savefpu)) { + *xfpusave_len = 0; + *xfpusave = NULL; + } else { + mcp->mc_flags |= _MC_HASFPXSTATE; + *xfpusave_len = mcp->mc_xfpustate_len = + cpu_max_ext_state_size - sizeof(struct savefpu); + *xfpusave = (char *)(get_pcb_user_save_td(td) + 1); } - mcp->mc_flags |= _MC_HASFPXSTATE; - mcp->mc_xfpustate_len = len; - bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. */ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(register_t dr6) { u_int64_t dr7; u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; bp = dr6 & DBREG_DR6_BMASK; if (bp == 0) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return (0); } dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return (0); } nbp = 0; /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return (nbp); } } /* * None of the breakpoints are in user space. */ return (0); } diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index e42d16d61b3a..0bfcd03d6655 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -1,742 +1,739 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ */ #include __FBSDID("$FreeBSD$"); #include "opt_isa.h" #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf), "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf."); void set_top_of_stack_td(struct thread *td) { td->td_md.md_stack_base = td->td_kstack + td->td_kstack_pages * PAGE_SIZE; } struct savefpu * get_pcb_user_save_td(struct thread *td) { KASSERT(((vm_offset_t)td->td_md.md_usr_fpu_save % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area ptr %p td %p", td->td_md.md_usr_fpu_save, td)); return (td->td_md.md_usr_fpu_save); } struct pcb * get_pcb_td(struct thread *td) { return (&td->td_md.md_pcb); } struct savefpu * get_pcb_user_save_pcb(struct pcb *pcb) { struct thread *td; td = __containerof(pcb, struct thread, td_md.md_pcb); return (get_pcb_user_save_td(td)); } void * alloc_fpusave(int flags) { void *res; struct savefpu_ymm *sf; res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); if (use_xsave) { sf = (struct savefpu_ymm *)res; bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; } return (res); } /* * Common code shared between cpu_fork() and cpu_copy_thread() for * initializing a thread. */ static void copy_thread(struct thread *td1, struct thread *td2) { struct pcb *pcb2; pcb2 = td2->td_pcb; /* Ensure that td1's pcb is up to date for user threads. */ if ((td2->td_pflags & TDP_KTHREAD) == 0) { MPASS(td1 == curthread); fpuexit(td1); update_pcb_bases(td1->td_pcb); } /* Copy td1's pcb */ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); /* Properly initialize pcb_save */ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); /* Kernel threads start with clean FPU and segment bases. */ if ((td2->td_pflags & TDP_KTHREAD) != 0) { pcb2->pcb_fsbase = 0; pcb2->pcb_gsbase = 0; clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE | PCB_KERNFPU | PCB_KERNFPU_THR); } else { MPASS((pcb2->pcb_flags & (PCB_KERNFPU | PCB_KERNFPU_THR)) == 0); bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), cpu_max_ext_state_size); } /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */ pcb2->pcb_rbp = 0; pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *); pcb2->pcb_rbx = (register_t)td2; /* fork_trampoline argument */ pcb2->pcb_rip = (register_t)fork_trampoline; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_[fg]sbase: cloned above */ pcb2->pcb_tssp = NULL; /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; pmap_thread_init_invl_gen(td2); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { struct proc *p1; struct pcb *pcb2; struct mdproc *mdp1, *mdp2; struct proc_ldt *pldt; p1 = td1->td_proc; if ((flags & RFPROC) == 0) { if ((flags & RFMEM) == 0) { /* unshare user LDT */ mdp1 = &p1->p_md; mtx_lock(&dt_lock); if ((pldt = mdp1->md_ldt) != NULL && pldt->ldt_refcnt > 1 && user_ldt_alloc(p1, 1) == NULL) panic("could not copy LDT"); mtx_unlock(&dt_lock); } return; } /* Point the stack and pcb to the actual location */ set_top_of_stack_td(td2); td2->td_pcb = pcb2 = get_pcb_td(td2); copy_thread(td1, td2); /* Reset debug registers in the new process */ x86_clear_dbregs(pcb2); /* Point mdproc and then copy over p1's contents */ mdp2 = &p2->p_md; bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); /* * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. */ td2->td_frame = (struct trapframe *)td2->td_md.md_stack_base - 1; bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); /* Set child return values. */ p2->p_sysent->sv_set_fork_retval(td2); /* * If the parent process has the trap bit set (i.e. a debugger * had single stepped the process to the system call), we need * to clear the trap flag from the new frame. */ td2->td_frame->tf_rflags &= ~PSL_T; /* As on i386, do not copy io permission bitmap. */ pcb2->pcb_tssp = NULL; /* New segment registers. */ set_pcb_flags_raw(pcb2, PCB_FULL_IRET); /* Copy the LDT, if necessary. */ mdp1 = &td1->td_proc->p_md; mdp2 = &p2->p_md; if (mdp1->md_ldt == NULL) { mdp2->md_ldt = NULL; return; } mtx_lock(&dt_lock); if (mdp1->md_ldt != NULL) { if (flags & RFMEM) { mdp1->md_ldt->ldt_refcnt++; mdp2->md_ldt = mdp1->md_ldt; bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct system_segment_descriptor)); } else { mdp2->md_ldt = NULL; mdp2->md_ldt = user_ldt_alloc(p2, 0); if (mdp2->md_ldt == NULL) panic("could not copy LDT"); amd64_set_ldt_data(td2, 0, max_ldt_segment, (struct user_segment_descriptor *) mdp1->md_ldt->ldt_base); } } else mdp2->md_ldt = NULL; mtx_unlock(&dt_lock); /* * Now, cpu_switch() can schedule the new process. * pcb_rsp is loaded pointing to the cpu_switch() stack frame * containing the return address when exiting cpu_switch. * This will normally be to fork_trampoline(), which will have * %rbx loaded with the new proc's pointer. fork_trampoline() * will set up a stack to call fork_return(p, frame); to complete * the return to user-mode. */ } void x86_set_fork_retval(struct thread *td) { struct trapframe *frame = td->td_frame; frame->tf_rax = 0; /* Child returns zero */ frame->tf_rflags &= ~PSL_C; /* success */ frame->tf_rdx = 1; /* System V emulation */ } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { /* * Note that the trap frame follows the args, so the function * is really called like this: func(arg, frame); */ td->td_pcb->pcb_r12 = (long) func; /* function */ td->td_pcb->pcb_rbx = (long) arg; /* first arg */ } void cpu_exit(struct thread *td) { /* * If this process has a custom LDT, release it. */ if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); } void cpu_thread_exit(struct thread *td) { struct pcb *pcb; critical_enter(); if (td == PCPU_GET(fpcurthread)) fpudrop(); critical_exit(); pcb = td->td_pcb; /* Disable any hardware breakpoints. */ if (pcb->pcb_flags & PCB_DBREGS) { reset_dbregs(); clear_pcb_flags(pcb, PCB_DBREGS); } } void cpu_thread_clean(struct thread *td) { struct pcb *pcb; pcb = td->td_pcb; /* * Clean TSS/iomap */ if (pcb->pcb_tssp != NULL) { pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp, (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1)); kmem_free((vm_offset_t)pcb->pcb_tssp, ctob(IOPAGES + 1)); pcb->pcb_tssp = NULL; } } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; struct xstate_hdr *xhdr; set_top_of_stack_td(td); td->td_pcb = pcb = get_pcb_td(td); td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1; td->td_md.md_usr_fpu_save = fpu_save_area_alloc(); - td->td_md.md_fpu_scratch = fpu_save_area_alloc(); pcb->pcb_save = get_pcb_user_save_pcb(pcb); if (use_xsave) { xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); bzero(xhdr, sizeof(*xhdr)); xhdr->xstate_bv = xsave_mask; } } void cpu_thread_free(struct thread *td) { cpu_thread_clean(td); fpu_save_area_free(td->td_md.md_usr_fpu_save); td->td_md.md_usr_fpu_save = NULL; - fpu_save_area_free(td->td_md.md_fpu_scratch); - td->td_md.md_fpu_scratch = NULL; } bool cpu_exec_vmspace_reuse(struct proc *p, vm_map_t map) { return (((curproc->p_md.md_flags & P_MD_KPTI) != 0) == (vm_map_pmap(map)->pm_ucr3 != PMAP_NO_CR3)); } static void cpu_procctl_kpti_ctl(struct proc *p, int val) { if (pti && val == PROC_KPTI_CTL_ENABLE_ON_EXEC) p->p_md.md_flags |= P_MD_KPTI; if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC) p->p_md.md_flags &= ~P_MD_KPTI; } static void cpu_procctl_kpti_status(struct proc *p, int *val) { *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ? PROC_KPTI_CTL_ENABLE_ON_EXEC: PROC_KPTI_CTL_DISABLE_ON_EXEC; if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3) *val |= PROC_KPTI_STATUS_ACTIVE; } static int cpu_procctl_la_ctl(struct proc *p, int val) { int error; error = 0; switch (val) { case PROC_LA_CTL_LA48_ON_EXEC: p->p_md.md_flags |= P_MD_LA48; p->p_md.md_flags &= ~P_MD_LA57; break; case PROC_LA_CTL_LA57_ON_EXEC: if (la57) { p->p_md.md_flags &= ~P_MD_LA48; p->p_md.md_flags |= P_MD_LA57; } else { error = ENOTSUP; } break; case PROC_LA_CTL_DEFAULT_ON_EXEC: p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57); break; } return (error); } static void cpu_procctl_la_status(struct proc *p, int *val) { int res; if ((p->p_md.md_flags & P_MD_LA48) != 0) res = PROC_LA_CTL_LA48_ON_EXEC; else if ((p->p_md.md_flags & P_MD_LA57) != 0) res = PROC_LA_CTL_LA57_ON_EXEC; else res = PROC_LA_CTL_DEFAULT_ON_EXEC; if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48) res |= PROC_LA_STATUS_LA48; else res |= PROC_LA_STATUS_LA57; *val = res; } int cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data) { struct proc *p; int error, val; switch (com) { case PROC_KPTI_CTL: case PROC_KPTI_STATUS: case PROC_LA_CTL: case PROC_LA_STATUS: if (idtype != P_PID) { error = EINVAL; break; } if (com == PROC_KPTI_CTL) { /* sad but true and not a joke */ error = priv_check(td, PRIV_IO); if (error != 0) break; } if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) { error = copyin(data, &val, sizeof(val)); if (error != 0) break; } if (com == PROC_KPTI_CTL && val != PROC_KPTI_CTL_ENABLE_ON_EXEC && val != PROC_KPTI_CTL_DISABLE_ON_EXEC) { error = EINVAL; break; } if (com == PROC_LA_CTL && val != PROC_LA_CTL_LA48_ON_EXEC && val != PROC_LA_CTL_LA57_ON_EXEC && val != PROC_LA_CTL_DEFAULT_ON_EXEC) { error = EINVAL; break; } error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p); if (error != 0) break; switch (com) { case PROC_KPTI_CTL: cpu_procctl_kpti_ctl(p, val); break; case PROC_KPTI_STATUS: cpu_procctl_kpti_status(p, &val); break; case PROC_LA_CTL: error = cpu_procctl_la_ctl(p, val); break; case PROC_LA_STATUS: cpu_procctl_la_status(p, &val); break; } PROC_UNLOCK(p); if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS) error = copyout(&val, data, sizeof(val)); break; default: error = EINVAL; break; } return (error); } void cpu_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; if (__predict_true(error == 0)) { frame->tf_rax = td->td_retval[0]; frame->tf_rdx = td->td_retval[1]; frame->tf_rflags &= ~PSL_C; return; } switch (error) { case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes, * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. * We saved this in tf_err. * %r10 (which was holding the value of %rcx) is restored * for the next iteration. * %r10 restore is only required for freebsd/amd64 processes, * but shall be innocent for any ia32 ABI. * * Require full context restore to get the arguments * in the registers reloaded at return to usermode. */ frame->tf_rip -= frame->tf_err; frame->tf_r10 = frame->tf_rcx; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); break; case EJUSTRETURN: break; default: frame->tf_rax = error; frame->tf_rflags |= PSL_C; break; } } /* * Initialize machine state, mostly pcb and trap frame for a new * thread, about to return to userspace. Put enough state in the new * thread's PCB to get it to go back to the fork_return(), which * finalizes the thread state and handles peculiarities of the first * return to userspace for the new thread. */ void cpu_copy_thread(struct thread *td, struct thread *td0) { copy_thread(td0, td); /* * Copy user general-purpose registers. * * Some of these registers are rewritten by cpu_set_upcall() * and linux_set_upcall(). */ bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); /* If the current thread has the trap bit set (i.e. a debugger had * single stepped the process to the system call), we need to clear * the trap flag from the new frame. Otherwise, the new thread will * receive a (likely unexpected) SIGTRAP when it executes the first * instruction after returning to userland. */ td->td_frame->tf_rflags &= ~PSL_T; set_pcb_flags_raw(td->td_pcb, PCB_FULL_IRET); } /* * Set that machine state for performing an upcall that starts * the entry function with the given argument. */ void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { /* * Do any extra cleaning that needs to be done. * The thread may have optional components * that are not present in a fresh thread. * This may be a recycled thread so make it look * as though it's newly allocated. */ cpu_thread_clean(td); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* * Set the trap frame to point at the beginning of the entry * function. */ td->td_frame->tf_rbp = 0; td->td_frame->tf_rsp = (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4; td->td_frame->tf_rip = (uintptr_t)entry; /* Return address sentinel value to stop stack unwinding. */ suword32((void *)td->td_frame->tf_rsp, 0); /* Pass the argument to the entry point. */ suword32((void *)(td->td_frame->tf_rsp + sizeof(int32_t)), (uint32_t)(uintptr_t)arg); return; } #endif /* * Set the trap frame to point at the beginning of the uts * function. */ td->td_frame->tf_rbp = 0; td->td_frame->tf_rsp = ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f; td->td_frame->tf_rsp -= 8; td->td_frame->tf_rip = (register_t)entry; td->td_frame->tf_ds = _udatasel; td->td_frame->tf_es = _udatasel; td->td_frame->tf_fs = _ufssel; td->td_frame->tf_gs = _ugssel; td->td_frame->tf_flags = TF_HASSEGS; /* Return address sentinel value to stop stack unwinding. */ suword((void *)td->td_frame->tf_rsp, 0); /* Pass the argument to the entry point. */ td->td_frame->tf_rdi = (register_t)arg; } int cpu_set_user_tls(struct thread *td, void *tls_base) { struct pcb *pcb; if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS) return (EINVAL); pcb = td->td_pcb; set_pcb_flags(pcb, PCB_FULL_IRET); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { pcb->pcb_gsbase = (register_t)tls_base; return (0); } #endif pcb->pcb_fsbase = (register_t)tls_base; return (0); } /* * Software interrupt handler for queued VM system processing. */ void swi_vm(void *dummy) { if (busdma_swi_pending != 0) busdma_swi(); } /* * Tell whether this address is in some physical memory region. * Currently used by the kernel coredump code in order to avoid * dumping the ``ISA memory hole'' which could cause indefinite hangs, * or other unpredictable behaviour. */ int is_physical_memory(vm_paddr_t addr) { #ifdef DEV_ISA /* The ISA ``memory hole''. */ if (addr >= 0xa0000 && addr < 0x100000) return 0; #endif /* * stuff other tests for known memory-mapped devices (PCI?) * here */ return 1; } diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 9b67c7001a87..1ca19072a1dc 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -1,974 +1,967 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2003 Peter Wemm * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD4 static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *); #endif static void ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp, - char *xfpusave, size_t xfpusave_len) + char **xfpusave, size_t *xfpusave_len) { - size_t max_len, len; - /* * XXX Format of 64bit and 32bit FXSAVE areas differs. FXSAVE * in 32bit mode saves %cs and %ds, while on 64bit it saves * 64bit instruction and data pointers. Ignore the difference * for now, it should be irrelevant for most applications. */ mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); - if (!use_xsave || xfpusave_len == 0) - return; - max_len = cpu_max_ext_state_size - sizeof(struct savefpu); - len = xfpusave_len; - if (len > max_len) { - len = max_len; - bzero(xfpusave + max_len, len - max_len); + if (!use_xsave || cpu_max_ext_state_size <= sizeof(struct savefpu)) { + *xfpusave_len = 0; + *xfpusave = NULL; + } else { + mcp->mc_flags |= _MC_IA32_HASFPXSTATE; + *xfpusave_len = mcp->mc_xfpustate_len = + cpu_max_ext_state_size - sizeof(struct savefpu); + *xfpusave = (char *)(get_pcb_user_save_td(td) + 1); } - mcp->mc_flags |= _MC_IA32_HASFPXSTATE; - mcp->mc_xfpustate_len = len; - bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int ia32_set_fpcontext(struct thread *td, struct ia32_mcontext *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } /* * Get machine context. */ static int ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); /* Entry into kernel always sets TF_HASSEGS */ mcp->mc_gs = tp->tf_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_rdi; mcp->mc_esi = tp->tf_rsi; mcp->mc_ebp = tp->tf_rbp; mcp->mc_isp = tp->tf_rsp; mcp->mc_eflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_rax; mcp->mc_edx = tp->tf_rdx; } mcp->mc_ebx = tp->tf_rbx; mcp->mc_ecx = tp->tf_rcx; mcp->mc_eip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); mcp->mc_flags = tp->tf_flags; ia32_get_fpcontext(td, mcp, NULL, 0); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ static int ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp) { struct trapframe *tp; char *xfpustate; long rflags; int ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); rflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_IA32_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); - xfpustate = (char *)td->td_md.md_fpu_scratch; + xfpustate = (char *)fpu_save_area_alloc(); ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate, mcp->mc_xfpustate_len); - if (ret != 0) + if (ret != 0) { + fpu_save_area_free((struct savefpu *)xfpustate); return (ret); + } } else xfpustate = NULL; ret = ia32_set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); + fpu_save_area_free((struct savefpu *)xfpustate); if (ret != 0) return (ret); tp->tf_gs = mcp->mc_gs; tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_flags = TF_HASSEGS; tp->tf_rdi = mcp->mc_edi; tp->tf_rsi = mcp->mc_esi; tp->tf_rbp = mcp->mc_ebp; tp->tf_rbx = mcp->mc_ebx; tp->tf_rdx = mcp->mc_edx; tp->tf_rcx = mcp->mc_ecx; tp->tf_rax = mcp->mc_eax; /* trapno, err */ tp->tf_rip = mcp->mc_eip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* * The first two fields of a ucontext_t are the signal mask and * the machine context. The next field is uc_link; we want to * avoid destroying the link when copying out contexts. */ #define UC_COPY_SIZE offsetof(struct ia32_ucontext, uc_link) int freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { bzero(&uc, sizeof(uc)); ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? EJUSTRETURN : ret); } int freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap) { struct ia32_ucontext uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { bzero(&uc, sizeof(uc)); ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = ia32_set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe3 sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct ia32_sigframe3 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); td->td_sigstk.ss_flags |= SS_ONSTACK; } else fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ah = (uintptr_t)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ah = (uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs; sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp; sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp; sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip; sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)fp; regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe4 sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int oonstack; int sig; td = curthread; p = td->td_proc; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct ia32_sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); } else sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1; PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode - sz_freebsd4_ia32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int oonstack; int sig; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_ia32_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { ia32_osendsig(catcher, ksi, mask); return; } #endif mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); - if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { - xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); - xfpusave = (char *)td->td_md.md_fpu_scratch; - } else { - xfpusave_len = 0; - xfpusave = NULL; - } - /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ - ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); + ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, &xfpusave, &xfpusave_len); fpstate_drop(td); sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; else sp = (char *)regs->tf_rsp; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(sf); /* Align to 16 bytes. */ sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* XXXKIB leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ #ifdef COMPAT_43 int ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap) { struct ia32_sigcontext3 sc, *scp; struct trapframe *regs; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_eflags; if (!EFL_SECURE(eflags, regs->tf_rflags)) { return (EINVAL); } if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; regs->tf_gs = scp->sc_gs; regs->tf_rax = scp->sc_eax; regs->tf_rbx = scp->sc_ebx; regs->tf_rcx = scp->sc_ecx; regs->tf_rdx = scp->sc_edx; regs->tf_rsi = scp->sc_esi; regs->tf_rdi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_rbp = scp->sc_ebp; regs->tf_rsp = scp->sc_esp; regs->tf_rip = scp->sc_eip; regs->tf_rflags = eflags; if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #endif #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_freebsd32_sigreturn(td, uap) struct thread *td; struct freebsd4_freebsd32_sigreturn_args /* { const struct freebsd4_freebsd32_ucontext *sigcntxp; } */ *uap; { struct ia32_ucontext4 uc; struct trapframe *regs; struct ia32_ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_rflags)) { uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; regs->tf_rbx = ucp->uc_mcontext.mc_ebx; regs->tf_rdx = ucp->uc_mcontext.mc_edx; regs->tf_rcx = ucp->uc_mcontext.mc_ecx; regs->tf_rax = ucp->uc_mcontext.mc_eax; regs->tf_trapno = ucp->uc_mcontext.mc_trapno; regs->tf_err = ucp->uc_mcontext.mc_err; regs->tf_rip = ucp->uc_mcontext.mc_eip; regs->tf_cs = cs; regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; regs->tf_ds = ucp->uc_mcontext.mc_ds; regs->tf_es = ucp->uc_mcontext.mc_es; regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int freebsd32_sigreturn(td, uap) struct thread *td; struct freebsd32_sigreturn_args /* { const struct freebsd32_ucontext *sigcntxp; } */ *uap; { struct ia32_ucontext uc; struct trapframe *regs; struct ia32_ucontext *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_rflags)) { uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((ucp->uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", td->td_proc->p_pid, td->td_name, xfpustate_len); return (EINVAL); } - xfpustate = (char *)td->td_md.md_fpu_scratch; + xfpustate = (char *)fpu_save_area_alloc(); error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate), xfpustate, xfpustate_len); if (error != 0) { + fpu_save_area_free((struct savefpu *)xfpustate); uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", td->td_proc->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = ia32_set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); + fpu_save_area_free((struct savefpu *)xfpustate); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", td->td_proc->p_pid, td->td_name, ret); return (ret); } regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; regs->tf_rbx = ucp->uc_mcontext.mc_ebx; regs->tf_rdx = ucp->uc_mcontext.mc_edx; regs->tf_rcx = ucp->uc_mcontext.mc_ecx; regs->tf_rax = ucp->uc_mcontext.mc_eax; regs->tf_trapno = ucp->uc_mcontext.mc_trapno; regs->tf_err = ucp->uc_mcontext.mc_err; regs->tf_rip = ucp->uc_mcontext.mc_eip; regs->tf_cs = cs; regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; regs->tf_ds = ucp->uc_mcontext.mc_ds; regs->tf_es = ucp->uc_mcontext.mc_es; regs->tf_fs = ucp->uc_mcontext.mc_fs; regs->tf_gs = ucp->uc_mcontext.mc_gs; regs->tf_flags = TF_HASSEGS; kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); } /* * Clear registers on exec */ void ia32_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); #ifdef COMPAT_43 setup_lcall_gate(); #endif pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucode32sel; regs->tf_rbx = (register_t)imgp->ps_strings; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; x86_clear_dbregs(pcb); fpstate_drop(td); /* Return via doreti so that we can change to a different %cs */ set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET); } diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index bd07f70f8d44..94ac69d31e65 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -1,127 +1,126 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)proc.h 7.1 (Berkeley) 5/15/91 * $FreeBSD$ */ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ #include #include #include /* * List of locks * c - proc lock * k - only accessed by curthread * pp - pmap.c:invl_gen_mtx */ struct proc_ldt { caddr_t ldt_base; int ldt_refcnt; }; #define PMAP_INVL_GEN_NEXT_INVALID 0x1ULL struct pmap_invl_gen { u_long gen; /* (k) */ union { LIST_ENTRY(pmap_invl_gen) link; /* (pp) */ struct { struct pmap_invl_gen *next; u_char saved_pri; }; }; } __aligned(16); /* * Machine-dependent part of the proc structure for AMD64. */ struct mdthread { int md_spinlock_count; /* (k) */ register_t md_saved_flags; /* (k) */ register_t md_spurflt_addr; /* (k) Spurious page fault address. */ struct pmap_invl_gen md_invl_gen; register_t md_efirt_tmp; /* (k) */ int md_efirt_dis_pf; /* (k) */ struct pcb md_pcb; vm_offset_t md_stack_base; struct savefpu *md_usr_fpu_save; - struct savefpu *md_fpu_scratch; }; struct mdproc { struct proc_ldt *md_ldt; /* (t) per-process ldt */ struct system_segment_descriptor md_ldt_sd; u_int md_flags; /* (c) md process flags P_MD */ }; #define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */ #define P_MD_LA48 0x00000002 /* Request LA48 after exec */ #define P_MD_LA57 0x00000004 /* Request LA57 after exec */ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 768 struct syscall_args { u_int code; u_int original_code; struct sysent *callp; register_t args[8]; }; #ifdef _KERNEL /* Get the current kernel thread stack usage. */ #define GET_STACK_USAGE(total, used) do { \ struct thread *td = curthread; \ (total) = td->td_kstack_pages * PAGE_SIZE; \ (used) = (char *)td->td_kstack + \ td->td_kstack_pages * PAGE_SIZE - \ (char *)&td; \ } while (0) struct proc_ldt *user_ldt_alloc(struct proc *, int); void user_ldt_free(struct thread *); struct sysarch_args; int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space); int amd64_set_ldt_data(struct thread *td, int start, int num, struct user_segment_descriptor *descs); extern struct mtx dt_lock; extern int max_ldt_segment; #define NARGREGS 6 #endif /* _KERNEL */ #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 62f939406374..65c5cc65c87e 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1,1768 +1,1768 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0x108, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x110, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x4a8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x6c0, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xc4, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3b8, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3d0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x9c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x308, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x34c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x6c, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x78, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x268, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x27c, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x308, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; struct thread_domain_data { struct thread *tdd_zombies; int tdd_reapticks; } __aligned(CACHE_LINE_SIZE); static struct thread_domain_data thread_domain_data[MAXMEMDOM]; static struct task thread_reap_task; static struct callout thread_reap_callout; static void thread_zombie(struct thread *); static void thread_reap(void); static void thread_reap_all(void); static void thread_reap_task_cb(void *, int); static void thread_reap_callout_cb(void *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); static void thread_free_batched(struct thread *td); static __exclusive_cache_line struct mtx tid_lock; static bitstr_t *tid_bitmap; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); static int maxthread; SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN, &maxthread, 0, "Maximum number of threads"); static __exclusive_cache_line int nthreads; static LIST_HEAD(tidhashhead, thread) *tidhashtbl; static u_long tidhash; static u_long tidhashlock; static struct rwlock *tidhashtbl_lock; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) #define TIDHASHLOCK(tid) (&tidhashtbl_lock[(tid) & tidhashlock]) EVENTHANDLER_LIST_DEFINE(thread_ctor); EVENTHANDLER_LIST_DEFINE(thread_dtor); EVENTHANDLER_LIST_DEFINE(thread_init); EVENTHANDLER_LIST_DEFINE(thread_fini); static bool thread_count_inc_try(void) { int nthreads_new; nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1; if (nthreads_new >= maxthread - 100) { if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 || nthreads_new >= maxthread) { atomic_subtract_int(&nthreads, 1); return (false); } } return (true); } static bool thread_count_inc(void) { static struct timeval lastfail; static int curfail; thread_reap(); if (thread_count_inc_try()) { return (true); } thread_reap_all(); if (thread_count_inc_try()) { return (true); } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxthread limit exceeded by uid %u " "(pid %d); consider increasing kern.maxthread\n", curthread->td_ucred->cr_ruid, curproc->p_pid); } return (false); } static void thread_count_sub(int n) { atomic_subtract_int(&nthreads, n); } static void thread_count_dec(void) { thread_count_sub(1); } static lwpid_t tid_alloc(void) { static lwpid_t trytid; lwpid_t tid; mtx_lock(&tid_lock); /* * It is an invariant that the bitmap is big enough to hold maxthread * IDs. If we got to this point there has to be at least one free. */ if (trytid >= maxthread) trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); if (tid == -1) { KASSERT(trytid != 0, ("unexpectedly ran out of IDs")); trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); KASSERT(tid != -1, ("unexpectedly ran out of IDs")); } bit_set(tid_bitmap, tid); trytid = tid + 1; mtx_unlock(&tid_lock); return (tid + NO_PID); } static void tid_free_locked(lwpid_t rtid) { lwpid_t tid; mtx_assert(&tid_lock, MA_OWNED); KASSERT(rtid >= NO_PID, ("%s: invalid tid %d\n", __func__, rtid)); tid = rtid - NO_PID; KASSERT(bit_test(tid_bitmap, tid) != 0, ("thread ID %d not allocated\n", rtid)); bit_clear(tid_bitmap, tid); } static void tid_free(lwpid_t rtid) { mtx_lock(&tid_lock); tid_free_locked(rtid); mtx_unlock(&tid_lock); } static void tid_free_batch(lwpid_t *batch, int n) { int i; mtx_lock(&tid_lock); for (i = 0; i < n; i++) { tid_free_locked(batch[i]); } mtx_unlock(&tid_lock); } /* * Batching for thread reapping. */ struct tidbatch { lwpid_t tab[16]; int n; }; static void tidbatch_prep(struct tidbatch *tb) { tb->n = 0; } static void tidbatch_add(struct tidbatch *tb, struct thread *td) { KASSERT(tb->n < nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); tb->tab[tb->n] = td->td_tid; tb->n++; } static void tidbatch_process(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n == nitems(tb->tab)) { tid_free_batch(tb->tab, tb->n); tb->n = 0; } } static void tidbatch_final(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n != 0) { tid_free_batch(tb->tab, tb->n); } } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; TD_SET_STATE(td, TDS_INACTIVE); td->td_lastcpu = td->td_oncpu = NOCPU; /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; #ifdef AUDIT audit_thread_alloc(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_ctor(td); #endif umtx_thread_alloc(td); MPASS(td->td_sel == NULL); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (TD_GET_STATE(td)) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_dtor(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); seltdfini(td); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_allocdomain = vm_phys_domain(vtophys(td)); td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_DIRECT_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); MPASS(td->td_sel == NULL); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } extern int max_threads_per_proc; /* * Initialize global thread allocation resources. */ void threadinit(void) { u_long i; lwpid_t tid0; uint32_t flags; /* * Place an upper limit on threads which can be allocated. * * Note that other factors may make the de facto limit much lower. * * Platform limits are somewhat arbitrary but deemed "more than good * enough" for the foreseable future. */ if (maxthread == 0) { #ifdef _LP64 maxthread = MIN(maxproc * max_threads_per_proc, 1000000); #else maxthread = MIN(maxproc * max_threads_per_proc, 100000); #endif } mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK); /* * Handle thread0. */ thread_count_inc(); tid0 = tid_alloc(); if (tid0 != THREAD0_TID) panic("tid0 %d != %d\n", tid0, THREAD0_TID); flags = UMA_ZONE_NOFREE; #ifdef __aarch64__ /* * Force thread structures to be allocated from the direct map. * Otherwise, superpage promotions and demotions may temporarily * invalidate thread structure mappings. For most dynamically allocated * structures this is not a problem, but translation faults cannot be * handled without accessing curthread. */ flags |= UMA_ZONE_CONTIG; #endif thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, flags); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); tidhashlock = (tidhash + 1) / 64; if (tidhashlock > 0) tidhashlock--; tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1), M_TIDHASH, M_WAITOK | M_ZERO); for (i = 0; i < tidhashlock + 1; i++) rw_init(&tidhashtbl_lock[i], "tidhash"); TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL); callout_init(&thread_reap_callout, 1); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Place an unused thread on the zombie list. */ void thread_zombie(struct thread *td) { struct thread_domain_data *tdd; struct thread *ztd; tdd = &thread_domain_data[td->td_allocdomain]; ztd = atomic_load_ptr(&tdd->tdd_zombies); for (;;) { td->td_zombie = ztd; if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t *)&ztd, (uintptr_t)td)) break; continue; } } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombies from passed domain. */ static void thread_reap_domain(struct thread_domain_data *tdd) { struct thread *itd, *ntd; struct tidbatch tidbatch; struct credbatch credbatch; int tdcount; struct plimit *lim; int limcount; /* * Reading upfront is pessimal if followed by concurrent atomic_swap, * but most of the time the list is empty. */ if (tdd->tdd_zombies == NULL) return; itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t)NULL); if (itd == NULL) return; /* * Multiple CPUs can get here, the race is fine as ticks is only * advisory. */ tdd->tdd_reapticks = ticks; tidbatch_prep(&tidbatch); credbatch_prep(&credbatch); tdcount = 0; lim = NULL; limcount = 0; while (itd != NULL) { ntd = itd->td_zombie; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd); tidbatch_add(&tidbatch, itd); credbatch_add(&credbatch, itd); MPASS(itd->td_limit != NULL); if (lim != itd->td_limit) { if (limcount != 0) { lim_freen(lim, limcount); limcount = 0; } } lim = itd->td_limit; limcount++; thread_free_batched(itd); tidbatch_process(&tidbatch); credbatch_process(&credbatch); tdcount++; if (tdcount == 32) { thread_count_sub(tdcount); tdcount = 0; } itd = ntd; } tidbatch_final(&tidbatch); credbatch_final(&credbatch); if (tdcount != 0) { thread_count_sub(tdcount); } MPASS(limcount != 0); lim_freen(lim, limcount); } /* * Reap zombies from all domains. */ static void thread_reap_all(void) { struct thread_domain_data *tdd; int i, domain; domain = PCPU_GET(domain); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[(i + domain) % vm_ndomains]; thread_reap_domain(tdd); } } /* * Reap zombies from local domain. */ static void thread_reap(void) { struct thread_domain_data *tdd; int domain; domain = PCPU_GET(domain); tdd = &thread_domain_data[domain]; thread_reap_domain(tdd); } static void thread_reap_task_cb(void *arg __unused, int pending __unused) { thread_reap_all(); } static void thread_reap_callout_cb(void *arg __unused) { struct thread_domain_data *tdd; int i, cticks, lticks; bool wantreap; wantreap = false; cticks = atomic_load_int(&ticks); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[i]; lticks = tdd->tdd_reapticks; if (tdd->tdd_zombies != NULL && (u_int)(cticks - lticks) > 5 * hz) { wantreap = true; break; } } if (wantreap) taskqueue_enqueue(taskqueue_thread, &thread_reap_task); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Calling this function guarantees that any thread that exited before * the call is reaped when the function returns. By 'exited' we mean * a thread removed from the process linkage with thread_unlink(). * Practically this means that caller must lock/unlock corresponding * process lock before the call, to synchronize with thread_exit(). */ void thread_reap_barrier(void) { struct task *t; /* * First do context switches to each CPU to ensure that all * PCPU pc_deadthreads are moved to zombie list. */ quiesce_all_cpus("", PDROP); /* * Second, fire the task in the same thread as normal * thread_reap() is done, to serialize reaping. */ t = malloc(sizeof(*t), M_TEMP, M_WAITOK); TASK_INIT(t, 0, thread_reap_task_cb, t); taskqueue_enqueue(taskqueue_thread, t); taskqueue_drain(taskqueue_thread, t); free(t, M_TEMP); } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; lwpid_t tid; if (!thread_count_inc()) { return (NULL); } tid = tid_alloc(); td = uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); tid_free(tid); thread_count_dec(); return (NULL); } td->td_tid = tid; bzero(&td->td_sa.args, sizeof(td->td_sa.args)); kmsan_thread_alloc(td); cpu_thread_alloc(td); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ static void thread_free_batched(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); callout_drain(&td->td_slpcallout); /* * Freeing handled by the caller. */ td->td_tid = -1; kmsan_thread_free(td); uma_zfree(thread_zone, td); } void thread_free(struct thread *td) { lwpid_t tid; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); tid = td->td_tid; thread_free_batched(td); tid_free(tid); thread_count_dec(); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_realucred = crcowget(p->p_ucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { MPASS(td->td_realucred == td->td_ucred); newtd->td_realucred = crcowget(td->td_realucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_realucred != NULL) crcowfree(td); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldlimit = NULL; PROC_LOCK(p); oldcred = crcowsync(); if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); MPASS(td->td_realucred == td->td_ucred); /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) { PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL); } else if (PMC_SYSTEM_SAMPLING_ACTIVE()) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg_locked(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); TD_SET_STATE(td, TDS_INACTIVE); #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ TD_SET_STATE(td, TDS_INACTIVE); td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); #ifdef EPOCH_TRACE SLIST_INIT(&td->td_epochs); #endif sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef EPOCH_TRACE MPASS(SLIST_EMPTY(&td->td_epochs)); #endif TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; /* * Since the thread lock is dropped by the scheduler we have * to retry to check for races. */ restart: switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) { wakeup_swapper |= thread_unsuspend_one(td2, p, true); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, EINTR); return (wakeup_swapper); } break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } } break; default: break; } thread_unlock(td2); return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); thread_unlock(td2); #endif } else thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND); PROC_LOCK(p); } return (0); } /* * Check for possible stops and suspensions while executing a * casueword or similar transiently failing operation. * * The sleep argument controls whether the function can handle a stop * request itself or it should return ERESTART and the request is * proceed at the kernel/user boundary in ast. * * Typically, when retrying due to casueword(9) failure (rv == 1), we * should handle the stop requests there, with exception of cases when * the thread owns a kernel resource, for instance busied the umtx * key, or when functions return immediately if thread_check_susp() * returned non-zero. On the other hand, retrying the whole lock * operation, we better not stop there but delegate the handling to * ast. * * If the request is for thread termination P_SINGLE_EXIT, we cannot * handle it at all, and simply return EINTR. */ int thread_check_susp(struct thread *td, bool sleep) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) error = sleep ? thread_suspend_check(0) : ERESTART; PROC_UNLOCK(p); return (error); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td, 0)); } void thread_run_flash(struct thread *td) { struct proc *p; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (TD_ON_SLEEPQ(td)) sleepq_remove_nested(td); else thread_lock(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); PROC_SLOCK(p); MPASS(p->p_suspcount > 0); p->p_suspcount--; PROC_SUNLOCK(p); if (setrunnable(td, 0)) kick_proc0(); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } else thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } else thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } /* * Locate a thread by number and return with proc lock held. * * thread exit establishes proc -> tidhash lock ordering, but lookup * takes tidhash first and needs to return locked proc. * * The problem is worked around by relying on type-safety of both * structures and doing the work in 2 steps: * - tidhash-locked lookup which saves both thread and proc pointers * - proc-locked verification that the found thread still matches */ static bool tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp) { #define RUN_THRESH 16 struct proc *p; struct thread *td; int run; bool locked; run = 0; rw_rlock(TIDHASHLOCK(tid)); locked = true; LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid != tid) { run++; continue; } p = td->td_proc; if (pid != -1 && p->p_pid != pid) { td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(TIDHASHLOCK(tid))) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(tid)); locked = false; break; } } break; } if (locked) rw_runlock(TIDHASHLOCK(tid)); if (td == NULL) return (false); *pp = p; *tdp = td; return (true); } struct thread * tdfind(lwpid_t tid, pid_t pid) { struct proc *p; struct thread *td; td = curthread; if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) return (NULL); PROC_LOCK(td->td_proc); return (td); } for (;;) { if (!tdfind_hash(tid, pid, &p, &td)) return (NULL); PROC_LOCK(p); if (td->td_tid != tid) { PROC_UNLOCK(p); continue; } if (td->td_proc != p) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); return (NULL); } return (td); } } void tidhash_add(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); } void tidhash_remove(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_REMOVE(td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); }