diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c index ad272676083e..2a8d2f892807 100644 --- a/sys/kern/kern_acct.c +++ b/sys/kern/kern_acct.c @@ -1,645 +1,645 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(sizeof(struct acctv3) - offsetof(struct acctv3, ac_trailer) == sizeof(struct acctv2) - offsetof(struct acctv2, ac_trailer), "trailer"); _Static_assert(sizeof(struct acctv3) - offsetof(struct acctv3, ac_len2) == sizeof(struct acctv2) - offsetof(struct acctv2, ac_len2), "len2"); /* * The routines implemented in this file are described in: * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison Welley, 1989) * on pages 62-63. * On May 2007 the historic 3 bits base 8 exponent, 13 bit fraction * compt_t representation described in the above reference was replaced * with that of IEEE-754 floats. * * Arguably, to simplify accounting operations, this mechanism should * be replaced by one in which an accounting log file (similar to /dev/klog) * is read by a user process, etc. However, that has its own problems. */ /* Floating point definitions from . */ #define FLT_MANT_DIG 24 /* p */ #define FLT_MAX_EXP 128 /* emax */ /* * Internal accounting functions. * The former's operation is described in Leffler, et al., and the latter * was provided by UCB with the 4.4BSD-Lite release */ static uint32_t encode_timeval(struct timeval); static uint32_t encode_long(long); static void acctwatch(void); static void acct_thread(void *); static int acct_disable(struct thread *, int); /* * Accounting vnode pointer, saved vnode pointer, and flags for each. * acct_sx protects against changes to the active vnode and credentials * while accounting records are being committed to disk. */ static int acct_configured; static int acct_suspended; static struct vnode *acct_vp; static struct ucred *acct_cred; static int acct_flags; static struct sx acct_sx; SX_SYSINIT(acct, &acct_sx, "acct_sx"); /* * State of the accounting kthread. */ static int acct_state; #define ACCT_RUNNING 1 /* Accounting kthread is running. */ #define ACCT_EXITREQ 2 /* Accounting kthread should exit. */ /* * Values associated with enabling and disabling accounting */ static int acctsuspend = 2; /* stop accounting when < 2% free space left */ SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, &acctsuspend, 0, "percentage of free disk space below which accounting stops"); static int acctresume = 4; /* resume when free space risen to > 4% */ SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, &acctresume, 0, "percentage of free disk space above which accounting resumes"); static int acctchkfreq = 15; /* frequency (in seconds) to check space */ static int sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS) { int error, value; /* Write out the old value. */ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int)); if (error || req->newptr == NULL) return (error); /* Read in and verify the new value. */ error = SYSCTL_IN(req, &value, sizeof(int)); if (error) return (error); if (value <= 0) return (EINVAL); acctchkfreq = value; return (0); } SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &acctchkfreq, 0, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &acctchkfreq, 0, sysctl_acct_chkfreq, "I", "frequency for checking the free space"); SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0, "Accounting configured or not"); SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0, "Accounting suspended or not"); /* * Accounting system call. Written based on the specification and previous * implementation done by Mark Tinguely. */ int sys_acct(struct thread *td, struct acct_args *uap) { struct nameidata nd; int error, flags, replacing; error = priv_check(td, PRIV_ACCT); if (error) return (error); /* * If accounting is to be started to a file, open that file for * appending and make sure it's a 'normal'. */ if (uap->path != NULL) { NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = FWRITE | O_APPEND; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC error = mac_system_check_acct(td->td_ucred, nd.ni_vp); if (error) { VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, flags, td->td_ucred, td); return (error); } #endif VOP_UNLOCK(nd.ni_vp); if (nd.ni_vp->v_type != VREG) { vn_close(nd.ni_vp, flags, td->td_ucred, td); return (EACCES); } #ifdef MAC } else { error = mac_system_check_acct(td->td_ucred, NULL); if (error) return (error); #endif } /* * Disallow concurrent access to the accounting vnode while we swap * it out, in order to prevent access after close. */ sx_xlock(&acct_sx); /* * Don't log spurious disable/enable messages if we are * switching from one accounting file to another due to log * rotation. */ replacing = (acct_vp != NULL && uap->path != NULL); /* * If accounting was previously enabled, kill the old space-watcher, * close the file, and (if no new file was specified, leave). Reset * the suspended state regardless of whether accounting remains * enabled. */ acct_suspended = 0; if (acct_vp != NULL) error = acct_disable(td, !replacing); if (uap->path == NULL) { if (acct_state & ACCT_RUNNING) { acct_state |= ACCT_EXITREQ; wakeup(&acct_state); } sx_xunlock(&acct_sx); return (error); } /* * Save the new accounting file vnode, and schedule the new * free space watcher. */ acct_vp = nd.ni_vp; acct_cred = crhold(td->td_ucred); acct_flags = flags; if (acct_state & ACCT_RUNNING) acct_state &= ~ACCT_EXITREQ; else { /* * Try to start up an accounting kthread. We may start more * than one, but if so the extras will commit suicide as * soon as they start up. */ error = kproc_create(acct_thread, NULL, NULL, 0, 0, "accounting"); if (error) { (void) acct_disable(td, 0); sx_xunlock(&acct_sx); log(LOG_NOTICE, "Unable to start accounting thread\n"); return (error); } } acct_configured = 1; sx_xunlock(&acct_sx); if (!replacing) log(LOG_NOTICE, "Accounting enabled\n"); return (error); } /* * Disable currently in-progress accounting by closing the vnode, dropping * our reference to the credential, and clearing the vnode's flags. */ static int acct_disable(struct thread *td, int logging) { int error; sx_assert(&acct_sx, SX_XLOCKED); error = vn_close(acct_vp, acct_flags, acct_cred, td); crfree(acct_cred); acct_configured = 0; acct_vp = NULL; acct_cred = NULL; acct_flags = 0; if (logging) log(LOG_NOTICE, "Accounting disabled\n"); return (error); } /* * Write out process accounting information, on process exit. * Data to be written out is specified in Leffler, et al. * and are enumerated below. (They're also noted in the system * "acct.h" header file.) */ int acct_process(struct thread *td) { struct acctv3 acct; struct timeval ut, st, tmp; struct proc *p; struct rusage ru; int t, ret; /* * Lockless check of accounting condition before doing the hard * work. */ if (acct_vp == NULL || acct_suspended) return (0); memset(&acct, 0, sizeof(acct)); sx_slock(&acct_sx); /* * If accounting isn't enabled, don't bother. Have to check again * once we own the lock in case we raced with disabling of accounting * by another thread. */ if (acct_vp == NULL || acct_suspended) { sx_sunlock(&acct_sx); return (0); } p = td->td_proc; td->td_pflags2 |= TDP2_ACCT; /* * Get process accounting information. */ sx_slock(&proctree_lock); PROC_LOCK(p); /* (1) The terminal from which the process was started */ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp); else acct.ac_tty = NODEV; sx_sunlock(&proctree_lock); /* (2) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); /* (3) The amount of user and system time that was used */ rufetchcalc(p, &ru, &ut, &st); acct.ac_utime = encode_timeval(ut); acct.ac_stime = encode_timeval(st); /* (4) The elapsed time the command ran (and its starting time) */ getboottime(&tmp); timevaladd(&tmp, &p->p_stats->p_start); acct.ac_btime = tmp.tv_sec; microuptime(&tmp); timevalsub(&tmp, &p->p_stats->p_start); acct.ac_etime = encode_timeval(tmp); /* (5) The average amount of memory used */ tmp = ut; timevaladd(&tmp, &st); /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */ t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss + + ru.ru_isrss) / t); else acct.ac_mem = 0; /* (6) The number of disk I/O operations done */ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock); /* (7) The UID and GID of the process */ acct.ac_uid = p->p_ucred->cr_ruid; acct.ac_gid = p->p_ucred->cr_rgid; /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flagx = p->p_acflag; PROC_UNLOCK(p); /* Setup ancillary structure fields. */ acct.ac_flagx |= ANVER; acct.ac_zero = 0; acct.ac_version = 3; acct.ac_len = acct.ac_len2 = sizeof(acct); /* * Write the accounting information to the file. */ ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED, NULL, td); sx_sunlock(&acct_sx); td->td_pflags2 &= ~TDP2_ACCT; return (ret); } /* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */ /* Convert timevals and longs into IEEE-754 bit patterns. */ /* Mantissa mask (MSB is implied, so subtract 1). */ #define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1) /* * We calculate integer values to a precision of approximately * 28 bits. * This is high-enough precision to fill the 24 float bits * and low-enough to avoid overflowing the 32 int bits. */ #define CALC_BITS 28 /* log_2(1000000). */ #define LOG2_1M 20 /* * Convert the elements of a timeval into a 32-bit word holding * the bits of a IEEE-754 float. * The float value represents the timeval's value in microsecond units. */ static uint32_t encode_timeval(struct timeval tv) { int log2_s; int val, exp; /* Unnormalized value and exponent */ int norm_exp; /* Normalized exponent */ int shift; /* * First calculate value and exponent to about CALC_BITS precision. * Note that the following conditionals have been ordered so that * the most common cases appear first. */ if (tv.tv_sec == 0) { if (tv.tv_usec == 0) return (0); exp = 0; val = tv.tv_usec; } else { /* * Calculate the value to a precision of approximately * CALC_BITS. */ log2_s = fls(tv.tv_sec) - 1; if (log2_s + LOG2_1M < CALC_BITS) { exp = 0; val = 1000000 * tv.tv_sec + tv.tv_usec; } else { exp = log2_s + LOG2_1M - CALC_BITS; val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec + tv.tv_usec) >> exp); } } /* Now normalize and pack the value into an IEEE-754 float. */ norm_exp = fls(val) - 1; shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d exp=%d shift=%d log2(val)=%d\n", val, exp, shift, norm_exp); printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* * Convert a non-negative long value into the bit pattern of * an IEEE-754 float value. */ static uint32_t encode_long(long val) { int norm_exp; /* Normalized exponent */ int shift; if (val == 0) return (0); if (val < 0) { log(LOG_NOTICE, "encode_long: negative value %ld in accounting record\n", val); val = LONG_MAX; } norm_exp = fls(val) - 1; shift = FLT_MANT_DIG - norm_exp - 1; #ifdef ACCT_DEBUG printf("val=%d shift=%d log2(val)=%d\n", val, shift, norm_exp); printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp, ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK)); #endif return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) | ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK)); } /* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */ /* * Periodically check the filesystem to see if accounting * should be turned on or off. Beware the case where the vnode * has been vgone()'d out from underneath us, e.g. when the file * system containing the accounting file has been forcibly unmounted. */ /* ARGSUSED */ static void acctwatch(void) { struct statfs *sp; sx_assert(&acct_sx, SX_XLOCKED); /* * If accounting was disabled before our kthread was scheduled, * then acct_vp might be NULL. If so, just ask our kthread to * exit and return. */ if (acct_vp == NULL) { acct_state |= ACCT_EXITREQ; return; } /* * If our vnode is no longer valid, tear it down and signal the * accounting thread to die. */ if (acct_vp->v_type == VBAD) { (void) acct_disable(NULL, 1); acct_state |= ACCT_EXITREQ; return; } /* * Stopping here is better than continuing, maybe it will be VBAD * next time around. */ sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); if (VFS_STATFS(acct_vp->v_mount, sp) < 0) { free(sp, M_STATFS); return; } if (acct_suspended) { if (sp->f_bavail > (int64_t)(acctresume * sp->f_blocks / 100)) { acct_suspended = 0; log(LOG_NOTICE, "Accounting resumed\n"); } } else { if (sp->f_bavail <= (int64_t)(acctsuspend * sp->f_blocks / 100)) { acct_suspended = 1; log(LOG_NOTICE, "Accounting suspended\n"); } } free(sp, M_STATFS); } /* * The main loop for the dedicated kernel thread that periodically calls * acctwatch(). */ static void acct_thread(void *dummy) { u_char pri; /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; thread_lock(curthread); sched_prio(curthread, pri); thread_unlock(curthread); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); if (acct_state & ACCT_RUNNING) { sx_xunlock(&acct_sx); kproc_exit(0); } acct_state |= ACCT_RUNNING; /* Loop until we are asked to exit. */ while (!(acct_state & ACCT_EXITREQ)) { /* Perform our periodic checks. */ acctwatch(); /* * We check this flag again before sleeping since the * acctwatch() might have shut down accounting and asked us * to exit. */ if (!(acct_state & ACCT_EXITREQ)) { sx_sleep(&acct_state, &acct_sx, 0, "-", acctchkfreq * hz); } } /* * Acknowledge the exit request and shutdown. We clear both the * exit request and running flags. */ acct_state = 0; sx_xunlock(&acct_sx); kproc_exit(0); } diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index d3043d16f4ec..e724a28487d6 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -1,1373 +1,1373 @@ /*- * Copyright (c) 2013-2015 Gleb Smirnoff * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_kern_tls.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory"); #define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 #define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 #define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3 /* * Structure describing a single sendfile(2) I/O, which may consist of * several underlying pager I/Os. * * The syscall context allocates the structure and initializes 'nios' * to 1. As sendfile_swapin() runs through pages and starts asynchronous * paging operations, it increments 'nios'. * * Every I/O completion calls sendfile_iodone(), which decrements the 'nios', * and the syscall also calls sendfile_iodone() after allocating all mbufs, * linking them and sending to socket. Whoever reaches zero 'nios' is * responsible to * call pru_ready on the socket, to notify it of readyness * of the data. */ struct sf_io { volatile u_int nios; u_int error; int npages; struct socket *so; struct mbuf *m; vm_object_t obj; vm_pindex_t pindex0; #ifdef KERN_TLS struct ktls_session *tls; #endif vm_page_t pa[]; }; /* * Structure used to track requests with SF_SYNC flag. */ struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; bool waiting; }; static void sendfile_sync_destroy(struct sendfile_sync *sfs) { KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs)); cv_destroy(&sfs->cv); mtx_destroy(&sfs->mtx); free(sfs, M_SENDFILE); } static void sendfile_sync_signal(struct sendfile_sync *sfs) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs)); if (--sfs->count == 0) { if (!sfs->waiting) { /* The sendfile() waiter was interrupted by a signal. */ sendfile_sync_destroy(sfs); return; } else { cv_signal(&sfs->cv); } } mtx_unlock(&sfs->mtx); } counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, - CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); static void sendfile_free_mext(struct mbuf *m) { struct sf_buf *sf; vm_page_t pg; int flags; KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF, ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m)); sf = m->m_ext.ext_arg1; pg = sf_buf_page(sf); flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0; sf_buf_free(sf); vm_page_release(pg, flags); if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { struct sendfile_sync *sfs = m->m_ext.ext_arg2; sendfile_sync_signal(sfs); } } static void sendfile_free_mext_pg(struct mbuf *m) { vm_page_t pg; int flags, i; bool cache_last; M_ASSERTEXTPG(m); cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST; flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0; for (i = 0; i < m->m_epg_npgs; i++) { if (cache_last && i == m->m_epg_npgs - 1) flags = 0; pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]); vm_page_release(pg, flags); } if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { struct sendfile_sync *sfs = m->m_ext.ext_arg1; sendfile_sync_signal(sfs); } } /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. */ static inline off_t xfsize(int i, int n, off_t off, off_t len) { if (i == 0) return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) return ((off + len) & PAGE_MASK); return (PAGE_SIZE); } /* * Helper function to get offset within object for i page. */ static inline vm_ooffset_t vmoff(int i, off_t off) { if (i == 0) return ((vm_ooffset_t)off); return (trunc_page(off + i * PAGE_SIZE)); } /* * Helper function used when allocation of a page or sf_buf failed. * Pretend as if we don't have enough space, subtract xfsize() of * all pages that failed. */ static inline void fixspace(int old, int new, off_t off, int *space) { KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); /* Subtract last one. */ *space -= xfsize(old - 1, old, off, *space); old--; if (new == old) /* There was only one page. */ return; /* Subtract first one. */ if (new == 0) { *space -= xfsize(0, old, off, *space); new++; } /* Rest of pages are full sized. */ *space -= (old - new) * PAGE_SIZE; KASSERT(*space >= 0, ("%s: space went backwards", __func__)); } /* * Wait for all in-flight ios to complete, we must not unwire pages * under them. */ static void sendfile_iowait(struct sf_io *sfio, const char *wmesg) { while (atomic_load_int(&sfio->nios) != 1) pause(wmesg, 1); } /* * I/O completion callback. */ static void sendfile_iodone(void *arg, vm_page_t *pa, int count, int error) { struct sf_io *sfio = arg; struct socket *so; int i; if (error != 0) sfio->error = error; /* * Restore the valid page pointers. They are already * unbusied, but still wired. * * XXXKIB since pages are only wired, and we do not * own the object lock, other users might have * invalidated them in meantime. Similarly, after we * unbusied the swapped-in pages, they can become * invalid under us. */ MPASS(count == 0 || pa[0] != bogus_page); for (i = 0; i < count; i++) { if (pa[i] == bogus_page) { sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] = pa[i] = vm_page_relookup(sfio->obj, pa[0]->pindex + i); KASSERT(pa[i] != NULL, ("%s: page %p[%d] disappeared", __func__, pa, i)); } else { vm_page_xunbusy_unchecked(pa[i]); } } if (!refcount_release(&sfio->nios)) return; #ifdef INVARIANTS for (i = 1; i < sfio->npages; i++) { if (sfio->pa[i] == NULL) break; KASSERT(vm_page_wired(sfio->pa[i]), ("sfio %p page %d %p not wired", sfio, i, sfio->pa[i])); if (i == 0) continue; KASSERT(sfio->pa[0]->object == sfio->pa[i]->object, ("sfio %p page %d %p wrong owner %p %p", sfio, i, sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object)); KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex, ("sfio %p page %d %p wrong index %jx %jx", sfio, i, sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex, (uintmax_t)sfio->pa[i]->pindex)); } #endif vm_object_pip_wakeup(sfio->obj); if (sfio->m == NULL) { /* * Either I/O operation failed, or we failed to allocate * buffers, or we bailed out on first busy page, or we * succeeded filling the request without any I/Os. Anyway, * pru_send hadn't been executed - nothing had been sent * to the socket yet. */ MPASS((curthread->td_pflags & TDP_KTHREAD) == 0); free(sfio, M_SENDFILE); return; } #if defined(KERN_TLS) && defined(INVARIANTS) if ((sfio->m->m_flags & M_EXTPG) != 0) KASSERT(sfio->tls == sfio->m->m_epg_tls, ("TLS session mismatch")); else KASSERT(sfio->tls == NULL, ("non-ext_pgs mbuf with TLS session")); #endif so = sfio->so; CURVNET_SET(so->so_vnet); if (__predict_false(sfio->error)) { /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear * it down. Protocol abort method would tear down protocol * state, free all ready mbufs and detach not ready ones. * We will free the mbufs corresponding to this I/O manually. * * The socket would be marked with EIO and made available * for read, so that application receives EIO on next * syscall and eventually closes the socket. */ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(sfio->m, sfio->npages); #ifdef KERN_TLS } else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) { /* * I/O operation is complete, but we still need to * encrypt. We cannot do this in the interrupt thread * of the disk controller, so forward the mbufs to a * different thread. * * Donate the socket reference from sfio to rather * than explicitly invoking soref(). */ ktls_enqueue(sfio->m, so, sfio->npages); goto out_with_ref; #endif } else (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); SOCK_LOCK(so); sorele(so); #ifdef KERN_TLS out_with_ref: #endif CURVNET_RESTORE(); free(sfio, M_SENDFILE); } /* * Iterate through pages vector and request paging for non-valid pages. */ static int sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off, off_t len, int rhpages, int flags) { vm_page_t *pa; int a, count, count1, grabbed, i, j, npages, rv; pa = sfio->pa; npages = sfio->npages; *nios = 0; flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; sfio->pindex0 = OFF_TO_IDX(off); /* * First grab all the pages and wire them. Note that we grab * only required pages. Readahead pages are dealt with later. */ grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off), VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages); if (grabbed < npages) { for (int i = grabbed; i < npages; i++) pa[i] = NULL; npages = grabbed; rhpages = 0; } for (i = 0; i < npages;) { /* Skip valid pages. */ if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))) { vm_page_xunbusy(pa[i]); SFSTAT_INC(sf_pages_valid); i++; continue; } /* * Next page is invalid. Check if it belongs to pager. It * may not be there, which is a regular situation for shmem * pager. For vnode pager this happens only in case of * a sparse file. * * Important feature of vm_pager_has_page() is the hint * stored in 'a', about how many pages we can pagein after * this page in a single I/O. */ VM_OBJECT_RLOCK(obj); if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL, &a)) { VM_OBJECT_RUNLOCK(obj); pmap_zero_page(pa[i]); vm_page_valid(pa[i]); MPASS(pa[i]->dirty == 0); vm_page_xunbusy(pa[i]); i++; continue; } VM_OBJECT_RUNLOCK(obj); /* * We want to pagein as many pages as possible, limited only * by the 'a' hint and actual request. */ count = min(a + 1, npages - i); /* * We should not pagein into a valid page because * there might be still unfinished write tracked by * e.g. a buffer, thus we substitute any valid pages * with the bogus one. * * We must not leave around xbusy pages which are not * part of the run passed to vm_pager_getpages(), * otherwise pager might deadlock waiting for the busy * status of the page, e.g. if it constitues the * buffer needed to validate other page. * * First trim the end of the run consisting of the * valid pages, then replace the rest of the valid * with bogus. */ count1 = count; for (j = i + count - 1; j > i; j--) { if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { vm_page_xunbusy(pa[j]); SFSTAT_INC(sf_pages_valid); count--; } else { break; } } /* * The last page in the run pa[i + count - 1] is * guaranteed to be invalid by the trim above, so it * is not replaced with bogus, thus -1 in the loop end * condition. */ MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL); for (j = i + 1; j < i + count - 1; j++) { if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { vm_page_xunbusy(pa[j]); SFSTAT_INC(sf_pages_valid); SFSTAT_INC(sf_pages_bogus); pa[j] = bogus_page; } } refcount_acquire(&sfio->nios); rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, i + count == npages ? &rhpages : NULL, &sendfile_iodone, sfio); if (__predict_false(rv != VM_PAGER_OK)) { sendfile_iowait(sfio, "sferrio"); /* * Do remaining pages recovery before returning EIO. * Pages from 0 to npages are wired. * Pages from (i + count1) to npages are busied. */ for (j = 0; j < npages; j++) { if (j >= i + count1) vm_page_xunbusy(pa[j]); KASSERT(pa[j] != NULL && pa[j] != bogus_page, ("%s: page %p[%d] I/O recovery failure", __func__, pa, j)); vm_page_unwire(pa[j], PQ_INACTIVE); pa[j] = NULL; } return (EIO); } SFSTAT_INC(sf_iocnt); SFSTAT_ADD(sf_pages_read, count); if (i + count == npages) SFSTAT_ADD(sf_rhpages_read, rhpages); i += count1; (*nios)++; } if (*nios == 0 && npages != 0) SFSTAT_INC(sf_noiocnt); return (0); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; error = 0; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } /* * Use the pager size when available to simplify synchronization * with filesystems, which otherwise must atomically update both * the vnode pager size and file size. */ if (obj->type == OBJT_VNODE) { VM_OBJECT_RLOCK(obj); *obj_size = obj->un_pager.vnp.vnp_size; } else { error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; VM_OBJECT_RLOCK(obj); } } else if (fp->f_type == DTYPE_SHM) { shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_RLOCK(obj); *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_RUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_RUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp); return (error); } static int sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ error = getsock_cap(td, s, &cap_send_rights, sock_fp, NULL, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); /* * SCTP one-to-one style sockets currently don't work with * sendfile(). So indicate EINVAL for now. */ if ((*so)->so_proto->pr_protocol == IPPROTO_SCTP) return (EINVAL); if (SOLISTENING(*so)) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; vm_page_t pga; struct socket *so; #ifdef KERN_TLS struct ktls_session *tls; #endif struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size, nobj_size; int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr; #ifdef KERN_TLS int tls_enq_cnt; #endif bool use_ext_pgs; obj = NULL; so = NULL; m = mh = NULL; sfs = NULL; #ifdef KERN_TLS tls = NULL; #endif hdrlen = sbytes = 0; softerr = 0; use_ext_pgs = false; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif SFSTAT_INC(sf_syscalls); SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); if (flags & SF_SYNC) { sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); sfs->waiting = true; } rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ error = SOCK_IO_SEND_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) goto out; #ifdef KERN_TLS tls = ktls_hold(so->so_snd.sb_tls_info); #endif /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; rem > 0; ) { struct sf_io *sfio; vm_page_t *pa; struct mbuf *m0, *mtail; int nios, space, npages, rhpages; mtail = NULL; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in * the socket buffer by the size of the header mbuf chain. * Clear hdr_uio here and hdrlen at the end of the first loop. */ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; #ifdef KERN_TLS if (tls != NULL) mh = m_uiotombuf(hdr_uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG); else #endif mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0); hdrlen = m_length(mh, &mhtail); space -= hdrlen; /* * If header consumed all the socket buffer space, * don't waste CPU cycles and jump to the end. */ if (space == 0) { sfio = NULL; nios = 0; goto prepend_header; } hdr_uio = NULL; } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; /* * Check to see if the file size has changed. */ if (obj->type == OBJT_VNODE) { VM_OBJECT_RLOCK(obj); nobj_size = obj->un_pager.vnp.vnp_size; VM_OBJECT_RUNLOCK(obj); } else { error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) { VOP_UNLOCK(vp); goto done; } nobj_size = va.va_size; } if (off >= nobj_size) { VOP_UNLOCK(vp); goto done; } if (nobj_size != obj_size) { obj_size = nobj_size; rem = nbytes ? omin(nbytes + offset, obj_size) : obj_size; rem -= off; } } if (space > rem) space = rem; else if (space > PAGE_SIZE) { /* * Use page boundaries when possible for large * requests. */ if (off & PAGE_MASK) space -= (PAGE_SIZE - (off & PAGE_MASK)); space = trunc_page(space); if (off & PAGE_MASK) space += (PAGE_SIZE - (off & PAGE_MASK)); } npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); /* * Calculate maximum allowed number of pages for readahead * at this iteration. If SF_USER_READAHEAD was set, we don't * do any heuristics and use exactly the value supplied by * application. Otherwise, we allow readahead up to "rem". * If application wants more, let it be, but there is no * reason to go above maxphys. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. */ if (flags & SF_USER_READAHEAD) { rhpages = SF_READAHEAD(flags); } else { rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; rhpages += SF_READAHEAD(flags); } rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); sfio = malloc(sizeof(struct sf_io) + npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK); refcount_init(&sfio->nios, 1); sfio->obj = obj; sfio->error = 0; sfio->m = NULL; sfio->npages = npages; #ifdef KERN_TLS /* * This doesn't use ktls_hold() because sfio->m will * also have a reference on 'tls' that will be valid * for all of sfio's lifetime. */ sfio->tls = tls; #endif vm_object_pip_add(obj, 1); error = sendfile_swapin(obj, sfio, &nios, off, space, rhpages, flags); if (error != 0) { if (vp != NULL) VOP_UNLOCK(vp); sendfile_iodone(sfio, NULL, 0, error); goto done; } /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ pa = sfio->pa; /* * Use unmapped mbufs if enabled for TCP. Unmapped * bufs are restricted to TCP as that is what has been * tested. In particular, unmapped mbufs have not * been tested with UNIX-domain sockets. * * TLS frames always require unmapped mbufs. */ if ((mb_use_ext_pgs && so->so_proto->pr_protocol == IPPROTO_TCP) #ifdef KERN_TLS || tls != NULL #endif ) { use_ext_pgs = true; #ifdef KERN_TLS if (tls != NULL) max_pgs = num_pages(tls->params.max_frame_len); else #endif max_pgs = MBUF_PEXT_MAX_PGS; /* Start at last index, to wrap on first use. */ ext_pgs_idx = max_pgs - 1; } for (int i = 0; i < npages; i++) { /* * If a page wasn't grabbed successfully, then * trim the array. Can happen only with SF_NODISKIO. */ if (pa[i] == NULL) { SFSTAT_INC(sf_busy); fixspace(npages, i, off, &space); sfio->npages = i; softerr = EBUSY; break; } pga = pa[i]; if (pga == bogus_page) pga = vm_page_relookup(obj, sfio->pindex0 + i); if (use_ext_pgs) { off_t xfs; ext_pgs_idx++; if (ext_pgs_idx == max_pgs) { m0 = mb_alloc_ext_pgs(M_WAITOK, sendfile_free_mext_pg); if (flags & SF_NOCACHE) { m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE; /* * See comment below regarding * ignoring SF_NOCACHE for the * last page. */ if ((npages - i <= max_pgs) && ((off + space) & PAGE_MASK) && (rem > space || rhpages > 0)) m0->m_ext.ext_flags |= EXT_FLAG_CACHE_LAST; } if (sfs != NULL) { m0->m_ext.ext_flags |= EXT_FLAG_SYNC; m0->m_ext.ext_arg1 = sfs; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } ext_pgs_idx = 0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; m0->m_epg_1st_off = vmoff(i, off) & PAGE_MASK; } if (nios) { mtail->m_flags |= M_NOTREADY; m0->m_epg_nrdy++; } m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga); m0->m_epg_npgs++; xfs = xfsize(i, npages, off, space); m0->m_epg_last_len = xfs; MBUF_EXT_PGS_ASSERT_SANITY(m0); mtail->m_len += xfs; mtail->m_ext.ext_size += PAGE_SIZE; continue; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pga, m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); sendfile_iowait(sfio, "sfnosf"); for (int j = i; j < npages; j++) { vm_page_unwire(pa[j], PQ_INACTIVE); pa[j] = NULL; } if (m == NULL) softerr = ENOBUFS; fixspace(npages, i, off, &space); sfio->npages = i; break; } m0 = m_get(M_WAITOK, MT_DATA); m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_type = EXT_SFBUF; m0->m_ext.ext_flags = EXT_FLAG_EMBREF; m0->m_ext.ext_free = sendfile_free_mext; /* * SF_NOCACHE sets the page as being freed upon send. * However, we ignore it for the last page in 'space', * if the page is truncated, and we got more data to * send (rem > space), or if we have readahead * configured (rhpages > 0). */ if ((flags & SF_NOCACHE) && (i != npages - 1 || !((off + space) & PAGE_MASK) || !(rem > space || rhpages > 0))) m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE; if (sfs != NULL) { m0->m_ext.ext_flags |= EXT_FLAG_SYNC; m0->m_ext.ext_arg2 = sfs; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } m0->m_ext.ext_count = 1; m0->m_flags |= (M_EXT | M_RDONLY); if (nios) m0->m_flags |= M_NOTREADY; m0->m_data = (char *)sf_buf_kva(sf) + (vmoff(i, off) & PAGE_MASK); m0->m_len = xfsize(i, npages, off, space); /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; } if (vp != NULL) VOP_UNLOCK(vp); /* Keep track of bytes processed. */ off += space; rem -= space; /* * Prepend header, if any. Save pointer to first mbuf * with a page. */ if (hdrlen) { prepend_header: m0 = mhtail->m_next = m; m = mh; mh = NULL; } else m0 = m; if (m == NULL) { KASSERT(softerr, ("%s: m NULL, no error", __func__)); error = softerr; sendfile_iodone(sfio, NULL, 0, 0); goto done; } /* Add the buffer chain to the socket buffer. */ KASSERT(m_length(m, NULL) == space + hdrlen, ("%s: mlen %u space %d hdrlen %d", __func__, m_length(m, NULL), space, hdrlen)); CURVNET_SET(so->so_vnet); #ifdef KERN_TLS if (tls != NULL) ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP); #endif if (nios == 0) { /* * If sendfile_swapin() didn't initiate any I/Os, * which happens if all data is cached in VM, or if * the header consumed all socket buffer space and * sfio is NULL, then we can send data right now * without the PRUS_NOTREADY flag. */ if (sfio != NULL) sendfile_iodone(sfio, NULL, 0, 0); #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); if (error != 0) { m_freem(m); } else { soref(so); ktls_enqueue(m, so, tls_enq_cnt); } } else #endif error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); } else { sfio->so = so; sfio->m = m0; soref(so); error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, error); } CURVNET_RESTORE(); m = NULL; if (error) goto done; sbytes += space + hdrlen; if (hdrlen) hdrlen = 0; if (softerr) { error = softerr; goto done; } } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { SOCK_IO_SEND_UNLOCK(so); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: SOCK_IO_SEND_UNLOCK(so); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (mh) m_freem(mh); if (sfs != NULL) { mtx_lock(&sfs->mtx); if (sfs->count != 0) error = cv_wait_sig(&sfs->cv, &sfs->mtx); if (sfs->count == 0) { sendfile_sync_destroy(sfs); } else { sfs->waiting = false; mtx_unlock(&sfs->mtx); } } #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (error == ERESTART) error = EINTR; return (error); } static int sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; struct file *fp; off_t sbytes; int error; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); sbytes = 0; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; #ifdef COMPAT_FREEBSD4 /* * In FreeBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } #endif } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } } AUDIT_ARG_FD(uap->fd); /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0) goto out; error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, td); fdrop(fp, td); if (uap->sbytes != NULL) copyout(&sbytes, uap->sbytes, sizeof(off_t)); out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (sendfile(td, uap, 0)); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 0c5e9f41f153..6f656ba92107 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1,4409 +1,4409 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); static bool kern_sig_discard_ign = true; SYSCTL_BOOL(_kern, OID_AUTO, sig_discard_ign, CTLFLAG_RWTUN, &kern_sig_discard_ign, 0, "Discard ignored signals on delivery, otherwise queue them to " "the target queue"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; #define _SIG_FOREACH_ADVANCE(i, set) ({ \ int __found; \ for (;;) { \ if (__bits != 0) { \ int __sig = ffs(__bits); \ __bits &= ~(1u << (__sig - 1)); \ sig = __i * sizeof((set)->__bits[0]) * NBBY + __sig; \ __found = 1; \ break; \ } \ if (++__i == _SIG_WORDS) { \ __found = 0; \ break; \ } \ __bits = (set)->__bits[__i]; \ } \ __found != 0; \ }) #define SIG_FOREACH(i, set) \ for (int32_t __i = -1, __bits = 0; \ _SIG_FOREACH_ADVANCE(i, set); ) \ sigset_t fastblock_mask; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. */ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; SIG_FOREACH(sig, &osigignore) { if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { /* * sigwait() function shall not return EINTR, but * the syscall does. Non-ancient libc provides the * wrapper which hides EINTR. Otherwise, EINTR return * is used by libthr to handle required cancellation * point in the sigwait(). */ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) return (ERESTART); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; bool traced; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; traced = false; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); timespecadd(&rts, timeout, &ets); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); if ((p->p_sysent->sv_flags & SV_SIG_DISCIGN) != 0 || !kern_sig_discard_ign) { thread_lock(td); td->td_flags |= TDF_SIGWAIT; thread_unlock(td); } for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } timespecsub(&ets, &rts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } if (traced) { error = EINTR; break; } error = msleep(&p->p_sigacts, &p->p_mtx, PPAUSE | PCATCH, "sigwait", timo); /* The syscalls can not be restarted. */ if (error == ERESTART) error = EINTR; /* We will calculate timeout by ourself. */ if (timeout != NULL && error == EAGAIN) error = 0; /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } thread_lock(td); td->td_flags &= ~TDF_SIGWAIT; thread_unlock(td); new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { int err; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); PROC_UNLOCK(p); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { killpg1_sendsig(p, true, &arg); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int code, sig; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } static int sig_sleepq_abort(struct thread *td, int intrval) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (intrval == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { thread_unlock(td); return (0); } return (sleepq_abort(td, intrval)); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, then we forget about it * immediately, except when the target process executes * sigwait(). (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { if (kern_sig_discard_ign && (p->p_sysent->sv_flags & SV_SIG_DISCIGN) == 0) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } else { action = SIG_CATCH; intrval = 0; } } else { if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; } mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) != 0 && (p->p_pgrp->pg_flags & PGRP_ORPHANED) != 0 && action == SIG_DFL) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out_cont; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out_cont; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sig_sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out_cont: itimer_proc_continue(p); kqtimer_proc_continue(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sig_sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static void ptrace_coredump(struct thread *td) { struct proc *p; struct thr_coredump_req *tcq; void *rl_cookie; MPASS(td == curthread); p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_dbgflags & TDB_COREDUMPRQ) == 0) return; KASSERT((p->p_flag & P_STOPPED_TRACE) != 0, ("not stopped")); tcq = td->td_coredump; KASSERT(tcq != NULL, ("td_coredump is NULL")); if (p->p_sysent->sv_coredump == NULL) { tcq->tc_error = ENOSYS; goto wake; } PROC_UNLOCK(p); rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); PROC_LOCK(p); wake: td->td_dbgflags &= ~TDB_COREDUMPRQ; td->td_coredump = NULL; wakeup(p); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: td->td_dbgflags |= TDB_SSWITCH; thread_suspend_switch(td, p); td->td_dbgflags &= ~TDB_SSWITCH; if ((td->td_dbgflags & TDB_COREDUMPRQ) != 0) { PROC_SUNLOCK(p); ptrace_coredump(td); PROC_SLOCK(p); goto stopme; } if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; SIG_FOREACH(sig, &block) { td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } enum sigstatus { SIGSTATUS_HANDLE, SIGSTATUS_HANDLED, SIGSTATUS_IGNORE, SIGSTATUS_SBDRY_STOP, }; /* * The thread has signal "sig" pending. Figure out what to do with it: * * _HANDLE -> the caller should handle the signal * _HANDLED -> handled internally, reload pending signal set * _IGNORE -> ignored, remove from the set of pending signals and try the * next pending signal * _SBDRY_STOP -> the signal should stop the thread but this is not * permitted in the current context */ static enum sigstatus sigprocess(struct thread *td, int sig) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; ksiginfo_t ksi; int prop; KASSERT(_SIG_VALID(sig), ("%s: invalid signal %d", __func__, sig)); p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); /* * We should allow pending but ignored signals below * only if there is sigwait() active, or P_TRACED was * on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { return (SIGSTATUS_IGNORE); } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) return (SIGSTATUS_HANDLED); /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); return (SIGSTATUS_HANDLED); } /* * If the traced bit got turned off, requeue the signal and * reload the set of pending signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); return (SIGSTATUS_HANDLED); } } /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif return (SIGSTATUS_IGNORE); } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ prop = sigprop(sig); if (prop & SIGPROP_STOP) { mtx_unlock(&ps->ps_mtx); if ((p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT)) != 0 || ((p->p_pgrp-> pg_flags & PGRP_ORPHANED) != 0 && (prop & SIGPROP_TTYSTOP) != 0)) { mtx_lock(&ps->ps_mtx); return (SIGSTATUS_IGNORE); } if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_SBDRY_STOP); } WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_HANDLED); } else if ((prop & SIGPROP_IGNORE) != 0 && (td->td_flags & TDF_SIGWAIT) == 0) { /* * Default action is to ignore; drop it if * not in kern_sigtimedwait(). */ return (SIGSTATUS_IGNORE); } else { return (SIGSTATUS_HANDLE); } case (intptr_t)SIG_IGN: if ((td->td_flags & TDF_SIGWAIT) == 0) return (SIGSTATUS_IGNORE); else return (SIGSTATUS_HANDLE); default: /* * This signal has an action, let postsig() process it. */ return (SIGSTATUS_HANDLE); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling * issignal by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; sigset_t sigpending; int sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ td->td_dbgflags |= TDB_FSTP; SIGEMPTYSET(sigpending); SIGADDSET(sigpending, SIGSTOP); } SIG_FOREACH(sig, &sigpending) { switch (sigprocess(td, sig)) { case SIGSTATUS_HANDLE: return (sig); case SIGSTATUS_HANDLED: goto next; case SIGSTATUS_IGNORE: sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); break; case SIGSTATUS_SBDRY_STOP: return (-1); } } next:; } } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } int sig_ast_checksusp(struct thread *td) { struct proc *p __diagused; int ret; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); return (ret); } int sig_ast_needsigchk(struct thread *td) { struct proc *p; struct sigacts *ps; int ret, sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_flags & TDF_NEEDSIGCHK) == 0) return (0); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); ret = 0; } /* * Do not go into sleep if this thread was the ptrace(2) * attach leader. cursig() consumed SIGSTOP from PT_ATTACH, * but we usually act on the signal by interrupting sleep, and * should do that here as well. */ if ((td->td_dbgflags & TDB_FSTP) != 0) { if (ret == 0) ret = EINTR; td->td_dbgflags &= ~TDB_FSTP; } return (ret); } int sig_intr(void) { struct thread *td; struct proc *p; int ret; td = curthread; if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) return (0); p = td->td_proc; PROC_LOCK(p); ret = sig_ast_checksusp(td); if (ret == 0) ret = sig_ast_needsigchk(td); PROC_UNLOCK(p); return (ret); } bool curproc_sigkilled(void) { struct thread *td; struct proc *p; struct sigacts *ps; bool res; td = curthread; if ((td->td_flags & TDF_NEEDSIGCHK) == 0) return (false); p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); res = SIGISMEMBER(td->td_sigqueue.sq_signals, SIGKILL) || SIGISMEMBER(p->p_sigqueue.sq_signals, SIGKILL); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (res); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. */ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3 || (p->p_pid == 1 && (kern_lognosys & 3) == 0)) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); SIG_FOREACH(sig, &ps->ps_sigcatch) { sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } thread_lock(td); td->td_flags |= TDF_ASTPENDING | TDF_NEEDSIGCHK; thread_unlock(td); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. */ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTPENDING) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index bb179043682e..d4db19598392 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -1,1791 +1,1791 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_kern_tls.h" #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function pointer set by the AIO routines so that the socket buffer code * can call back into the AIO module if it is loaded. */ void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on socket buffers */ u_long sb_max = SB_MAX; u_long sb_max_adj = (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */ static u_long sb_efficiency = 8; /* parameter for sbreserve() */ #ifdef KERN_TLS static void sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); #endif static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); /* * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY. */ static void sbm_clrprotoflags(struct mbuf *m, int flags) { int mask; mask = ~M_PROTOFLAGS; if (flags & PRUS_NOTREADY) mask |= M_NOTREADY; while (m) { m->m_flags &= mask; m = m->m_next; } } /* * Compress M_NOTREADY mbufs after they have been readied by sbready(). * * sbcompress() skips M_NOTREADY mbufs since the data is not available to * be copied at the time of sbcompress(). This function combines small * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first * mbuf sbready() marked ready, and 'end' is the first mbuf still not * ready. */ static void sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end) { struct mbuf *m, *n; int ext_size; SOCKBUF_LOCK_ASSERT(sb); if ((sb->sb_flags & SB_NOCOALESCE) != 0) return; for (m = m0; m != end; m = m->m_next) { MPASS((m->m_flags & M_NOTREADY) == 0); /* * NB: In sbcompress(), 'n' is the last mbuf in the * socket buffer and 'm' is the new mbuf being copied * into the trailing space of 'n'. Here, the roles * are reversed and 'n' is the next mbuf after 'm' * that is being copied into the trailing space of * 'm'. */ n = m->m_next; #ifdef KERN_TLS /* Try to coalesce adjacent ktls mbuf hdr/trailers. */ if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && (m->m_flags & M_EXTPG) && (n->m_flags & M_EXTPG) && !mbuf_has_tls_session(m) && !mbuf_has_tls_session(n)) { int hdr_len, trail_len; hdr_len = n->m_epg_hdrlen; trail_len = m->m_epg_trllen; if (trail_len != 0 && hdr_len != 0 && trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) { /* copy n's header to m's trailer */ memcpy(&m->m_epg_trail[trail_len], n->m_epg_hdr, hdr_len); m->m_epg_trllen += hdr_len; m->m_len += hdr_len; n->m_epg_hdrlen = 0; n->m_len -= hdr_len; } } #endif /* Compress small unmapped mbufs into plain mbufs. */ if ((m->m_flags & M_EXTPG) && m->m_len <= MLEN && !mbuf_has_tls_session(m)) { ext_size = m->m_ext.ext_size; if (mb_unmapped_compress(m) == 0) { sb->sb_mbcnt -= ext_size; sb->sb_ccnt -= 1; } } while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && M_WRITABLE(m) && (m->m_flags & M_EXTPG) == 0 && !mbuf_has_tls_session(n) && !mbuf_has_tls_session(m) && n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ n->m_len <= M_TRAILINGSPACE(m) && m->m_type == n->m_type) { KASSERT(sb->sb_lastrecord != n, ("%s: merging start of record (%p) into previous mbuf (%p)", __func__, n, m)); m_copydata(n, 0, n->m_len, mtodo(m, m->m_len)); m->m_len += n->m_len; m->m_next = n->m_next; m->m_flags |= n->m_flags & M_EOR; if (sb->sb_mbtail == n) sb->sb_mbtail = m; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (n->m_flags & M_EXT) { sb->sb_mbcnt -= n->m_ext.ext_size; sb->sb_ccnt -= 1; } m_free(n); n = m->m_next; } } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); } /* * Mark ready "count" units of I/O starting with "m". Most mbufs * count as a single unit of I/O except for M_EXTPG mbufs which * are backed by multiple pages. */ int sbready(struct sockbuf *sb, struct mbuf *m0, int count) { struct mbuf *m; u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); m = m0; blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); if ((m->m_flags & M_EXTPG) != 0 && m->m_epg_npgs != 0) { if (count < m->m_epg_nrdy) { m->m_epg_nrdy -= count; count = 0; break; } count -= m->m_epg_nrdy; m->m_epg_nrdy = 0; } else count--; m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; m = m->m_next; } /* * If the first mbuf is still not fully ready because only * some of its backing pages were readied, no further progress * can be made. */ if (m0 == m) { MPASS(m->m_flags & M_NOTREADY); return (EINPROGRESS); } if (!blocker) { sbready_compress(sb, m0, m); return (EINPROGRESS); } /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { KASSERT(m->m_flags & M_BLOCKED, ("%s: m %p !M_BLOCKED", __func__, m)); m->m_flags &= ~M_BLOCKED; sb->sb_acc += m->m_len; } sb->sb_fnrdy = m; sbready_compress(sb, m0, m); return (0); } /* * Adjust sockbuf state reflecting allocation of m. */ void sballoc(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) { if (m->m_flags & M_NOTREADY) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; } else m->m_flags |= M_BLOCKED; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; sb->sb_mbcnt += MSIZE; sb->sb_mcnt += 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; sb->sb_ccnt += 1; } } /* * Adjust sockbuf state reflecting freeing of m. */ void sbfree(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; if (!(m->m_flags & M_NOTAVAIL)) sb->sb_acc -= m->m_len; if (m == sb->sb_fnrdy) { struct mbuf *n; KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= m->m_len; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; sb->sb_ccnt -= 1; } if (sb->sb_sndptr == m) { sb->sb_sndptr = NULL; sb->sb_sndptroff = 0; } if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= m->m_len; } #ifdef KERN_TLS /* * Similar to sballoc/sbfree but does not adjust state associated with * the sb_mb chain such as sb_fnrdy or sb_sndptr*. Also assumes mbufs * are not ready. */ void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mbcnt += MSIZE; sb->sb_mcnt += 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt += m->m_ext.ext_size; sb->sb_ccnt += 1; } } void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; sb->sb_tlscc -= m->m_len; sb->sb_mbcnt -= MSIZE; sb->sb_mcnt -= 1; if (m->m_flags & M_EXT) { sb->sb_mbcnt -= m->m_ext.ext_size; sb->sb_ccnt -= 1; } } #endif /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system * that no more data is to be sent, by the protocol code (in case * PRU_SHUTDOWN). Socantrcvmore indicates that no more data will be * received, and will normally be applied to the socket by a protocol when it * detects that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantsendmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_snd); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantsendmore(struct socket *so) { SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED); } void socantrcvmore_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); so->so_rcv.sb_state |= SBS_CANTRCVMORE; #ifdef KERN_TLS if (so->so_rcv.sb_flags & SB_TLS_RX) ktls_check_rx(&so->so_rcv); #endif sorwakeup_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void socantrcvmore(struct socket *so) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void soroverflow_locked(struct socket *so) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_options & SO_RERROR) { so->so_rerror = ENOBUFS; sorwakeup_locked(so); } else SOCKBUF_UNLOCK(&so->so_rcv); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } void soroverflow(struct socket *so) { SOCKBUF_LOCK(&so->so_rcv); soroverflow_locked(so); mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags |= SB_WAIT; return (msleep_sbt(&sb->sb_acc, SOCKBUF_MTX(sb), (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo, 0, 0)); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ void sowakeup(struct socket *so, struct sockbuf *sb) { int ret; SOCKBUF_LOCK_ASSERT(sb); selwakeuppri(sb->sb_sel, PSOCK); if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); soupcall_clear(so, SO_RCV); } } else ret = SU_OK; if (sb->sb_flags & SB_AIO) sowakeup_aio(so, sb); SOCKBUF_UNLOCK(sb); if (ret == SU_ISCONNECTED) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and one for * receiving data. Each buffer contains a queue of mbufs, information about * the number of mbufs and amount of data in the queue, and other fields * allowing select() statements and notification on data availability to be * implemented. * * Data stored in a socket buffer is maintained as a list of records. Each * record is a list of mbufs chained together with the m_next field. Records * are chained together with the m_nextpkt field. The upper level routine * soreceive() expects the following conventions to be observed when placing * information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's name, * then a record containing that name must be present before any * associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really just * additional data associated with the message), and there are ``rights'' * to be received, then a record containing this data should be present * (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by a data * record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space should * be released by calling sbrelease() when the socket is destroyed. */ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { struct thread *td = curthread; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0) goto bad; if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (0); bad2: sbrelease_locked(&so->so_snd, so); bad: SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_sb_max = sb_max; error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req); if (error || !req->newptr) return (error); if (tmp_sb_max < MSIZE + MCLBYTES) return (EINVAL); sb_max = tmp_sb_max; sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES); return (0); } /* * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't * become limiting if buffering efficiency is near the normal case. */ int sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, struct thread *td) { rlim_t sbsize_limit; SOCKBUF_LOCK_ASSERT(sb); /* * When a thread is passed, we take into account the thread's socket * buffer size limit. The caller will generally pass curthread, but * in the TCP input path, NULL will be passed to indicate that no * appropriate thread resource limits are available. In that case, * we don't apply a process limit. */ if (cc > sb_max_adj) return (0); if (td != NULL) { sbsize_limit = lim_cur(td, RLIMIT_SBSIZE); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (0); sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); } int sbsetopt(struct socket *so, int cmd, u_long cc) { struct sockbuf *sb; short *flags; u_int *hiwat, *lowat; int error; sb = NULL; SOCK_LOCK(so); if (SOLISTENING(so)) { switch (cmd) { case SO_SNDLOWAT: case SO_SNDBUF: lowat = &so->sol_sbsnd_lowat; hiwat = &so->sol_sbsnd_hiwat; flags = &so->sol_sbsnd_flags; break; case SO_RCVLOWAT: case SO_RCVBUF: lowat = &so->sol_sbrcv_lowat; hiwat = &so->sol_sbrcv_hiwat; flags = &so->sol_sbrcv_flags; break; } } else { switch (cmd) { case SO_SNDLOWAT: case SO_SNDBUF: sb = &so->so_snd; break; case SO_RCVLOWAT: case SO_RCVBUF: sb = &so->so_rcv; break; } flags = &sb->sb_flags; hiwat = &sb->sb_hiwat; lowat = &sb->sb_lowat; SOCKBUF_LOCK(sb); } error = 0; switch (cmd) { case SO_SNDBUF: case SO_RCVBUF: if (SOLISTENING(so)) { if (cc > sb_max_adj) { error = ENOBUFS; break; } *hiwat = cc; if (*lowat > *hiwat) *lowat = *hiwat; } else { if (!sbreserve_locked(sb, cc, so, curthread)) error = ENOBUFS; } if (error == 0) *flags &= ~SB_AUTOSIZE; break; case SO_SNDLOWAT: case SO_RCVLOWAT: /* * Make sure the low-water is never greater than the * high-water. */ *lowat = (cc > *hiwat) ? *hiwat : cc; break; } if (!SOLISTENING(so)) SOCKBUF_UNLOCK(sb); SOCK_UNLOCK(so); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ void sbrelease_internal(struct sockbuf *sb, struct socket *so) { sbflush_internal(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease_locked(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK_ASSERT(sb); sbrelease_internal(sb, so); } void sbrelease(struct sockbuf *sb, struct socket *so) { SOCKBUF_LOCK(sb); sbrelease_locked(sb, so); SOCKBUF_UNLOCK(sb); } void sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) ktls_free(sb->sb_tls_info); sb->sb_tls_info = NULL; #endif } /* * Routines to add and remove data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to append * new mbufs to a socket buffer, after checking that adequate space is * available, comparing the function sbspace() with the amount of data to be * added. sbappendrecord() differs from sbappend() in that data supplied is * treated as the beginning of a new record. To place a sender's address, * optional access rights, and data in a socket receive buffer, * sbappendaddr() should be used. To place access rights and data in a * socket receive buffer, sbappendrights() should be used. In either case, * the new data begins a new record. Note that unlike sbappend() and * sbappendrecord(), these routines check for the caller that there will be * enough space to store the data. Each fails if there is not enough space, * or if it cannot find mbufs to store additional information in. * * Reliable protocols may use the socket send buffer to hold data awaiting * acknowledgement. Data is normally copied from a socket send buffer in a * protocol with m_copy for output to a peer, and then removing the data from * the socket buffer with sbdrop() or sbdroprecord() when the data is * acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } #ifdef KERN_TLS m = sb->sb_mtls; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mtlstail) { printf("%s: sb_mtls %p sb_mtlstail %p last %p\n", __func__, sb->sb_mtls, sb->sb_mtlstail, m); printf("TLS packet tree:\n"); printf("\t"); for (m = sb->sb_mtls; m != NULL; m = m->m_next) { printf("%p ", m); } printf("\n"); panic("%s from %s:%u", __func__, file, line); } #endif } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags) { struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == NULL) return; sbm_clrprotoflags(m, flags); SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappend_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef KERN_TLS /* * Append an mbuf containing encrypted TLS data. The data * is marked M_NOTREADY until it has been decrypted and * stored as a TLS record. */ static void sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m) { struct mbuf *n; SBLASTMBUFCHK(sb); /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, 0); for (n = m; n != NULL; n = n->m_next) n->m_flags |= M_NOTREADY; sbcompress_ktls_rx(sb, m, sb->sb_mtlstail); ktls_check_rx(sb); } #endif /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); #ifdef KERN_TLS /* * Decrypted TLS records are appended as records via * sbappendrecord(). TCP passes encrypted TLS records to this * function which must be scheduled for decryption. */ if (sb->sb_flags & SB_TLS_RX) { sbappend_ktls_rx(sb, m); return; } #endif KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) ktls_seq(sb, m); #endif /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; #ifdef KERN_TLS u_long tlscc; #endif SOCKBUF_LOCK_ASSERT(sb); acc = ccc = mbcnt = 0; fnrdy = NULL; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { if (m->m_len == 0) { printf("sb %p empty mbuf %p\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) { if (m != sb->sb_fnrdy) { printf("sb %p: fnrdy %p != m %p\n", sb, sb->sb_fnrdy, m); goto fail; } fnrdy = m; } if (fnrdy) { if (!(m->m_flags & M_NOTAVAIL)) { printf("sb %p: fnrdy %p, m %p is avail\n", sb, sb->sb_fnrdy, m); goto fail; } } else acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } #ifdef KERN_TLS /* * Account for mbufs "detached" by ktls_detach_record() while * they are decrypted by ktls_decrypt(). tlsdcc gives a count * of the detached bytes that are included in ccc. The mbufs * and clusters are not included in the socket buffer * accounting. */ ccc += sb->sb_tlsdcc; tlscc = 0; for (m = sb->sb_mtls; m; m = m->m_next) { if (m->m_nextpkt != NULL) { printf("sb %p TLS mbuf %p with nextpkt\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) == 0) { printf("sb %p TLS mbuf %p ready\n", sb, m); goto fail; } tlscc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } if (sb->sb_tlscc != tlscc) { printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, sb->sb_tlsdcc); goto fail; } #endif if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); #ifdef KERN_TLS printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, sb->sb_tlsdcc); #endif goto fail; } return; fail: panic("%s from %s:%u", __func__, file, line); } #endif /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == NULL) return; m_clrprotoflags(m0); /* * Put the first mbuf on the queue. Note this permits zero length * records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } /* always call sbcompress() so it can do SBLASTMBUFCHK() */ sbcompress(sb, m, m0); } /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* Helper routine that appends data, control, and address to a sockbuf. */ static int sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last) { struct mbuf *m, *n, *nlast; #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif m = m_get(M_NOWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (m0) { m_clrprotoflags(m0); m_tag_delete_chain(m0, NULL); /* * Clear some persistent info from pkthdr. * We don't use m_demote(), because some netgraph consumers * expect M_PKTHDR presence. */ m0->m_pkthdr.rcvif = NULL; m0->m_pkthdr.flowid = 0; m0->m_pkthdr.csum_flags = 0; m0->m_pkthdr.fibnum = 0; m0->m_pkthdr.rsstype = 0; } if (ctrl_last) ctrl_last->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if insufficient mbufs. Does not validate space * on the receiving sockbuf. */ int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; SOCKBUF_LOCK_ASSERT(sb); ctrl_last = (control == NULL) ? NULL : m_last(control); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } void sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags) { struct mbuf *m, *mlast; sbm_clrprotoflags(m0, flags); m_last(control)->m_next = m0; SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); } void sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags) { SOCKBUF_LOCK(sb); sbappendcontrol_locked(sb, m0, control, flags); SOCKBUF_UNLOCK(sb); } /* * Append the data in mbuf chain (m) into the socket buffer sb following mbuf * (n). If (n) is NULL, the buffer is presumed empty. * * When the data is compressed, mbufs in the chain may be handled in one of * three ways: * * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no * record boundary, and no change in data type). * * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into * an mbuf already in the socket buffer. This can occur if an * appropriate mbuf exists, there is room, both mbufs are not marked as * not ready, and no merging of data types will occur. * * (3) The mbuf may be appended to the end of the existing mbuf chain. * * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as * end-of-record. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && !(n->m_flags & (M_NOTREADY | M_EXTPG)) && !mbuf_has_tls_session(m) && !mbuf_has_tls_session(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) sb->sb_acc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (m->m_len <= MLEN && (m->m_flags & M_EXTPG) && (m->m_flags & M_NOTREADY) == 0 && !mbuf_has_tls_session(m)) (void)mb_unmapped_compress(m); if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { KASSERT(n != NULL, ("sbcompress: eor && n == NULL")); n->m_flags |= eor; } SBLASTMBUFCHK(sb); } #ifdef KERN_TLS /* * A version of sbcompress() for encrypted TLS RX mbufs. These mbufs * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also * a bit simpler (no EOR markers, always MT_DATA, etc.). */ static void sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { SOCKBUF_LOCK_ASSERT(sb); while (m) { KASSERT((m->m_flags & M_EOR) == 0, ("TLS RX mbuf %p with EOR", m)); KASSERT(m->m_type == MT_DATA, ("TLS RX mbuf %p is not MT_DATA", m)); KASSERT((m->m_flags & M_NOTREADY) != 0, ("TLS RX mbuf %p ready", m)); KASSERT((m->m_flags & M_EXTPG) == 0, ("TLS RX mbuf %p unmapped", m)); if (m->m_len == 0) { m = m_free(m); continue; } /* * Even though both 'n' and 'm' are NOTREADY, it's ok * to coalesce the data. */ if (n && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(n->m_flags & (M_EXTPG)) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n)) { m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; sb->sb_tlscc += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mtls = m; sb->sb_mtlstail = m; sballoc_ktls_rx(sb, m); n = m; m = m->m_next; n->m_next = NULL; } SBLASTMBUFCHK(sb); } #endif /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ static void sbflush_internal(struct sockbuf *sb) { while (sb->sb_mbcnt || sb->sb_tlsdcc) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; m_freem(sbcut_internal(sb, (int)sb->sb_ccc)); } KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); } void sbflush_locked(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sbflush_internal(sb); } void sbflush(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Cut data from (the front of) a sockbuf. */ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; bool is_tls; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u", __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; is_tls = false; mfree = NULL; while (len > 0) { if (m == NULL) { #ifdef KERN_TLS if (next == NULL && !is_tls) { if (sb->sb_tlsdcc != 0) { MPASS(len >= sb->sb_tlsdcc); len -= sb->sb_tlsdcc; sb->sb_ccc -= sb->sb_tlsdcc; sb->sb_tlsdcc = 0; if (len == 0) break; } next = sb->sb_mtls; is_tls = true; } #endif KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; } if (m->m_len > len) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p M_NOTAVAIL", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; sb->sb_acc -= len; if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; #ifdef KERN_TLS if (is_tls) sbfree_ktls_rx(sb, m); else #endif sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ if (m->m_flags & M_NOTREADY && !is_tls) m = m->m_next; else { struct mbuf *n; n = m->m_next; m->m_next = mfree; mfree = m; m = n; } } /* * Free any zero-length mbufs from the buffer. * For SOCK_DGRAM sockets such mbufs represent empty records. * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer, * when sosend_generic() needs to send only control data. */ while (m && m->m_len == 0) { struct mbuf *n; sbfree(sb, m); n = m->m_next; m->m_next = mfree; mfree = m; m = n; } #ifdef KERN_TLS if (is_tls) { sb->sb_mb = NULL; sb->sb_mtls = m; if (m == NULL) sb->sb_mtlstail = NULL; } else #endif if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure * sb_lastrecord is up-to-date if we dropped part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } return (mfree); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); m_freem(sbcut_internal(sb, len)); } /* * Drop data from (the front of) a sockbuf, * and return it to caller. */ struct mbuf * sbcut_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); return (sbcut_internal(sb, len)); } void sbdrop(struct sockbuf *sb, int len) { struct mbuf *mfree; SOCKBUF_LOCK(sb); mfree = sbcut_internal(sb, len); SOCKBUF_UNLOCK(sb); m_freem(mfree); } struct mbuf * sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { *moff = off; if (sb->sb_sndptr == NULL) { sb->sb_sndptr = sb->sb_mb; sb->sb_sndptroff = 0; } return (sb->sb_mb); } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } *moff = off; return (m); } void sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len) { /* * A small copy was done, advance forward the sb_sbsndptr to cover * it. */ struct mbuf *m; if (mb != sb->sb_sndptr) { /* Did not copyout at the same mbuf */ return; } m = mb; while (m && (len > 0)) { if (len >= m->m_len) { len -= m->m_len; if (m->m_next) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } m = m->m_next; } else { len = 0; } } } /* * Return the first mbuf and the mbuf data offset for the provided * send offset without changing the "sb_sndptroff" field. */ struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); /* * If the "off" is below the stored offset, which happens on * retransmits, just use "sb_mb": */ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { m = sb->sb_mb; } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } while (off > 0 && m != NULL) { if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } *moff = off; return (m); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord_locked(struct sockbuf *sb) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data with the specified * type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol_how(void *p, int size, int type, int level, int wait) { struct cmsghdr *cp; struct mbuf *m; MBUF_CHECKSLEEP(wait); if (CMSG_SPACE((u_int)size) > MCLBYTES) return ((struct mbuf *) NULL); if (CMSG_SPACE((u_int)size) > MLEN) m = m_getcl(wait, MT_CONTROL, 0); else m = m_get(wait, MT_CONTROL); if (m == NULL) return ((struct mbuf *) NULL); cp = mtod(m, struct cmsghdr *); m->m_len = 0; KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); /* * Don't leave the padding between the msg header and the * cmsg data and the padding after the cmsg data un-initialized. */ bzero(cp, CMSG_SPACE((u_int)size)); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } struct mbuf * sbcreatecontrol(caddr_t p, int size, int type, int level) { return (sbcreatecontrol_how(p, size, type, level, M_NOWAIT)); } /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. Note * that the xsockbuf structure, since it is always embedded in a socket, does * not include a self pointer nor a length. We make this entry point public * in case some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mcnt = sb->sb_mcnt; xsb->sb_ccnt = sb->sb_ccnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW | CTLFLAG_SKIP, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, - CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &sb_max, 0, + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, "Socket buffer size waste factor"); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 5d2139cb4a31..1fd63640988b 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4463 +1,4463 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, - CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, - CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); log(LOG_DEBUG, "%s: pcb %p (%s): Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(&descrsb), qlen, overcount); sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; bool last __diagused; SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_NOFDREF | SS_PROTOREF)) != SS_NOFDREF || refcount_load(&so->so_count) != 0 || so->so_qstate == SQ_COMP) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we don't have SS_NOFDREF neither SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After relock the socket may not move to so_comp since it * doesn't have PCB already, but it may be removed from * so_incomp. If that happens, we share responsiblity on * freeing the socket, but soclose() has already removed * it from queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; last = refcount_release(&sol->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, sol)); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele(sol); KASSERT(refcount_load(&so->so_count) == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; struct socket *sp, *tsp; int error = 0; bool last __diagused; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); TAILQ_INIT(&lqueue); SOCK_LOCK(so); if (SOLISTENING(so)) { TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (refcount_load(&sp->so_count) == 0) { SOCK_UNLOCK(sp); soabort(sp); } else { /* See the handling of queued sockets in sofree(). */ SOCK_UNLOCK(sp); } } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; /* XXXMJ racy */ if (SOLISTENING(so)) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { m_freem(top); m_freem(control); error = ENOTCONN; } CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for a non-application data * record. If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* This will need to change for TLS 1.3. */ if (tgr.tls_type != TLS_RLTYPE_APP) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); else error = ENOTCONN; CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { SOLISTEN_LOCK(so); so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } goto done; } CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; int error; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); KASSERT(error == 0, ("%s: cannot lock sock %p recv buffer", __func__, so)); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. */ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { if (control != NULL) m_freem(control); if ((flags & PRUS_NOTREADY) == 0) m_freem(m); return (EOPNOTSUPP); } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); goto again; } SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); }