diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index ac086618b8b1..f3755d625572 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1,1612 +1,1619 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct rwlock uihashtbl_lock; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); static void ruxagg_ext_locked(struct rusage_ext *rux, struct thread *td); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int sys_getpriority(struct thread *td, struct getpriority_args *uap) { return (kern_getpriority(td, uap->which, uap->who)); } int kern_getpriority(struct thread *td, int which, int who) { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (which) { case PRIO_PROCESS: if (who == 0) low = td->td_proc->p_nice; else { p = pfind(who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (who == 0) who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0 && p->p_ucred->cr_uid == who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int sys_setpriority(struct thread *td, struct setpriority_args *uap) { return (kern_setpriority(td, uap->which, uap->who, uap->prio)); } int kern_setpriority(struct thread *td, int which, int who, int prio) { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (which) { case PRIO_PROCESS: if (who == 0) { PROC_LOCK(curp); error = donice(td, curp, prio); PROC_UNLOCK(curp); } else { p = pfind(who); if (p == NULL) break; error = p_cansee(td, p); if (error == 0) error = donice(td, p, prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { error = donice(td, p, prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (who == 0) who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p->p_ucred->cr_uid == who && p_cansee(td, p) == 0) { error = donice(td, p, prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); sched_nice(p, n); return (0); } static int unprivileged_idprio; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW, &unprivileged_idprio, 0, "Allow non-root users to set an idle priority (deprecated)"); /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->lwpid == 0 || uap->lwpid == td->td_tid) { p = td->td_proc; td1 = td; PROC_LOCK(p); } else { td1 = tdfind(uap->lwpid, -1); if (td1 == NULL) return (ESRCH); p = td1->td_proc; } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; pri_to_rtp(td1, &rtp); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ /* * Realtime priority has to be restricted for reasons which * should be obvious. However, for idleprio processes, there is * a potential for system deadlock if an idleprio process gains * a lock on a resource that other processes need (and the * idleprio process can't run due to a CPU-bound normal * process). Fix me! XXX * * This problem is not only related to idleprio process. * A user level program can obtain a file lock and hold it * indefinitely. Additionally, without idleprio processes it is * still conceivable that a program with low priority will never * get to run. In short, allowing this feature might make it * easier to lock a resource indefinitely, but it is not the * only thing that makes it possible. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME && (error = priv_check(td, PRIV_SCHED_RTPRIO)) != 0) break; if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0 && (error = priv_check(td, PRIV_SCHED_IDPRIO)) != 0) break; error = rtp_to_pri(&rtp, td1); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int sys_rtprio(struct thread *td, struct rtprio_args *uap) { struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* * Disallow setting rtprio in most cases if not superuser. * See the comment in sys_rtprio_thread about idprio * threads holding a lock. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME && (error = priv_check(td, PRIV_SCHED_RTPRIO)) != 0) break; if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0 && (error = priv_check(td, PRIV_SCHED_IDPRIO)) != 0) break; /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. */ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri, oldclass, oldpri; switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) return (EINVAL); newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } thread_lock(td); oldclass = td->td_pri_class; sched_class(td, rtp->type); /* XXX fix */ oldpri = td->td_user_pri; sched_user_prio(td, newpri); if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL || td->td_pri_class != RTP_PRIO_NORMAL)) sched_prio(td, td->td_user_pri); if (TD_ON_UPILOCK(td) && oldpri != newpri) { critical_enter(); thread_unlock(td); umtx_pi_adjust(td, oldpri); critical_exit(); } else thread_unlock(td); return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; thread_unlock(td); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(struct thread *td, struct osetrlimit_args *uap) { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(struct thread *td, struct ogetrlimit_args *uap) { struct orlimit olim; struct rlimit rl; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rl); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int sys_setrlimit(struct thread *td, struct setrlimit_args *uap) { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } static void lim_cb(void *arg) { struct rlimit rlim; struct thread *td; struct proc *p; p = arg; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the process exceeds its cpu resource allocation. If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit == RLIM_INFINITY) return; PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); } PROC_STATUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit_proc(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; kern_psignal(p, SIGXCPU); } } if ((p->p_flag & P_WEXIT) == 0) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); } int kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp) { return (kern_proc_setrlimit(td, td->td_proc, which, limp)); } int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp) { struct plimit *newlim, *oldlim, *oldlim_td; struct rlimit *alimp; struct rlimit oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz.rlim_cur = 0; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); p->p_cpulimit = limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = *alimp; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(&oldssiz, RLIMIT_STACK); break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(limp, which); *alimp = *limp; p->p_limit = newlim; PROC_UPDATE_COW(p); oldlim_td = NULL; if (td == curthread && PROC_COW_CHANGECOUNT(td, p) == 1) { oldlim_td = lim_cowsync(); thread_cow_synced(td); } PROC_UNLOCK(p); if (oldlim_td != NULL) { MPASS(oldlim_td == oldlim); lim_freen(oldlim, 2); } else { lim_free(oldlim); } if (which == RLIMIT_STACK && /* * Skip calls from exec_new_vmspace(), done when stack is * not mapped yet. */ (td != curthread || (p->p_flag & P_INEXEC) == 0)) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. */ if (limp->rlim_cur != oldssiz.rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz.rlim_cur) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz.rlim_cur; addr = round_page(p->p_vmspace->vm_stacktop) - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz.rlim_cur - limp->rlim_cur; addr = round_page(p->p_vmspace->vm_stacktop) - oldssiz.rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, 0, VM_MAP_PROTECT_SET_PROT); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int sys_getrlimit(struct thread *td, struct getrlimit_args *uap) { struct rlimit rlim; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rlim); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(struct proc *p, struct timeval *up, struct timeval *sp) { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct thread *td; uint64_t runtime, u; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_STATLOCK_ASSERT(p, MA_OWNED); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ td = curthread; if (td->td_proc == p) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } /* Make sure the per-thread stats are current. */ FOREACH_THREAD_IN_PROC(p, td) { if (td->td_incruntime == 0) continue; ruxagg(p, td); } calcru1(p, &p->p_rux, up, sp); } /* Collect resource usage for a single thread. */ void rufetchtd(struct thread *td, struct rusage *ru) { struct proc *p; uint64_t runtime, u; p = td->td_proc; PROC_STATLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we are getting stats for the current thread, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (td == curthread) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } ruxagg_locked(p, td); *ru = td->td_ru; calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime); } static uint64_t mul64_by_fraction(uint64_t a, uint64_t b, uint64_t c) { uint64_t acc, bh, bl; int i, s, sa, sb; /* * Calculate (a * b) / c accurately enough without overflowing. c * must be nonzero, and its top bit must be 0. a or b must be * <= c, and the implementation is tuned for b <= c. * * The comments about times are for use in calcru1() with units of * microseconds for 'a' and stathz ticks at 128 Hz for b and c. * * Let n be the number of top zero bits in c. Each iteration * either returns, or reduces b by right shifting it by at least n. * The number of iterations is at most 1 + 64 / n, and the error is * at most the number of iterations. * * It is very unusual to need even 2 iterations. Previous * implementations overflowed essentially by returning early in the * first iteration, with n = 38 giving overflow at 105+ hours and * n = 32 giving overlow at at 388+ days despite a more careful * calculation. 388 days is a reasonable uptime, and the calculation * needs to work for the uptime times the number of CPUs since 'a' * is per-process. */ if (a >= (uint64_t)1 << 63) return (0); /* Unsupported arg -- can't happen. */ acc = 0; for (i = 0; i < 128; i++) { sa = flsll(a); sb = flsll(b); if (sa + sb <= 64) /* Up to 105 hours on first iteration. */ return (acc + (a * b) / c); if (a >= c) { /* * This reduction is based on a = q * c + r, with the * remainder r < c. 'a' may be large to start, and * moving bits from b into 'a' at the end of the loop * sets the top bit of 'a', so the reduction makes * significant progress. */ acc += (a / c) * b; a %= c; sa = flsll(a); if (sa + sb <= 64) /* Up to 388 days on first iteration. */ return (acc + (a * b) / c); } /* * This step writes a * b as a * ((bh << s) + bl) = * a * (bh << s) + a * bl = (a << s) * bh + a * bl. The 2 * additive terms are handled separately. Splitting in * this way is linear except for rounding errors. * * s = 64 - sa is the maximum such that a << s fits in 64 * bits. Since a < c and c has at least 1 zero top bit, * sa < 64 and s > 0. Thus this step makes progress by * reducing b (it increases 'a', but taking remainders on * the next iteration completes the reduction). * * Finally, the choice for s is just what is needed to keep * a * bl from overflowing, so we don't need complications * like a recursive call mul64_by_fraction(a, bl, c) to * handle the second additive term. */ s = 64 - sa; bh = b >> s; bl = b - (bh << s); acc += (a * bl) / c; a <<= s; b = bh; } return (0); /* Algorithm failure -- can't happen. */ } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ uint64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } /* Subdivide tu. Avoid overflow in the multiplications. */ if (__predict_true(tu <= ((uint64_t)1 << 38) && tt <= (1 << 26))) { /* Up to 76 hours when stathz is 128. */ uu = (tu * ut) / tt; su = (tu * st) / tt; } else { uu = mul64_by_fraction(tu, ut, tt); su = mul64_by_fraction(tu, st, tt); } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. */ if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else if (vm_guest == VM_GUEST_NO) { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int sys_getrusage(struct thread *td, struct getrusage_args *uap) { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(struct thread *td, int who, struct rusage *rup) { struct proc *p; int error; error = 0; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: rufetchcalc(p, rup, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_THREAD: PROC_STATLOCK(p); thread_lock(td); rufetchtd(td, rup); thread_unlock(td); PROC_STATUNLOCK(p); break; default: error = EINVAL; } PROC_UNLOCK(p); return (error); } void rucollect(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2) { rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; rucollect(ru, ru2); } /* * Aggregate tick counts into the proc's rusage_ext. */ static void ruxagg_ext_locked(struct rusage_ext *rux, struct thread *td) { rux->rux_runtime += td->td_incruntime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; rux->rux_iticks += td->td_iticks; } void ruxagg_locked(struct proc *p, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); PROC_STATLOCK_ASSERT(td->td_proc, MA_OWNED); ruxagg_ext_locked(&p->p_rux, td); ruxagg_ext_locked(&td->td_rux, td); td->td_incruntime = 0; td->td_uticks = 0; td->td_iticks = 0; td->td_sticks = 0; } void ruxagg(struct proc *p, struct thread *td) { thread_lock(td); ruxagg_locked(p, td); thread_unlock(td); } /* * Update the rusage_ext structure and fetch a valid aggregate rusage * for proc p if storage for one is supplied. */ void rufetch(struct proc *p, struct rusage *ru) { struct thread *td; PROC_STATLOCK_ASSERT(p, MA_OWNED); *ru = p->p_ru; if (p->p_numthreads > 0) { FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); rucollect(ru, &td->td_ru); } } } /* * Atomically perform a rufetch and a calcru together. * Consumers, can safely assume the calcru is executed only once * rufetch is completed. */ void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp) { PROC_STATLOCK(p); rufetch(p, ru); calcru(p, up, sp); PROC_STATUNLOCK(p); } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc(void) { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(struct plimit *limp) { refcount_acquire(&limp->pl_refcnt); return (limp); } struct plimit * lim_cowsync(void) { struct thread *td; struct proc *p; struct plimit *oldlimit; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_limit == p->p_limit) return (NULL); oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); return (oldlimit); } void lim_fork(struct proc *p1, struct proc *p2) { PROC_LOCK_ASSERT(p1, MA_OWNED); PROC_LOCK_ASSERT(p2, MA_OWNED); p2->p_limit = lim_hold(p1->p_limit); callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0); if (p1->p_cpulimit != RLIM_INFINITY) callout_reset_sbt(&p2->p_limco, SBT_1S, 0, lim_cb, p2, C_PREL(1)); } void lim_free(struct plimit *limp) { if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } void lim_freen(struct plimit *limp, int n) { if (refcount_releasen(&limp->pl_refcnt, n)) free((void *)limp, M_PLIMIT); } void limbatch_add(struct limbatch *lb, struct thread *td) { struct plimit *limp; MPASS(td->td_limit != NULL); limp = td->td_limit; if (lb->limp != limp) { if (lb->count != 0) { lim_freen(lb->limp, lb->count); lb->count = 0; } lb->limp = limp; } lb->count++; } void limbatch_final(struct limbatch *lb) { MPASS(lb->count != 0); lim_freen(lb->limp, lb->count); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(struct plimit *dst, struct plimit *src) { KASSERT(dst->pl_refcnt <= 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_max); } rlim_t lim_max_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t (lim_cur)(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_cur); } rlim_t lim_cur_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct thread *td, int which, struct rlimit *rlp) { struct proc *p = td->td_proc; MPASS(td == curthread); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = td->td_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void uihashinit(void) { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. * Increase refcount on uidinfo struct returned. */ static struct uidinfo * uilookup(uid_t uid) { struct uihashhead *uipp; struct uidinfo *uip; rw_assert(&uihashtbl_lock, RA_LOCKED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) { uihold(uip); break; } return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid_t uid) { struct uidinfo *new_uip, *uip; struct ucred *cred; cred = curthread->td_ucred; if (cred->cr_uidinfo->ui_uid == uid) { uip = cred->cr_uidinfo; uihold(uip); return (uip); } else if (cred->cr_ruidinfo->ui_uid == uid) { uip = cred->cr_ruidinfo; uihold(uip); return (uip); } rw_rlock(&uihashtbl_lock); uip = uilookup(uid); rw_runlock(&uihashtbl_lock); if (uip != NULL) return (uip); new_uip = malloc(sizeof(*new_uip), M_UIDINFO, M_WAITOK | M_ZERO); racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((uip = uilookup(uid)) == NULL) { LIST_INSERT_HEAD(UIHASH(uid), new_uip, ui_hash); rw_wunlock(&uihashtbl_lock); uip = new_uip; } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); free(new_uip, M_UIDINFO); } return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(struct uidinfo *uip) { refcount_acquire(&uip->ui_ref); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(struct uidinfo *uip) { if (refcount_release_if_not_last(&uip->ui_ref)) return; rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref) == 0) { rw_wunlock(&uihashtbl_lock); return; } racct_destroy(&uip->ui_racct); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); free(uip, M_UIDINFO); } #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct uidinfo *uip; struct uihashhead *uih; rw_rlock(&uihashtbl_lock); if (pre != NULL) (pre)(); for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } } if (post != NULL) (post)(); rw_runlock(&uihashtbl_lock); } #endif static inline int chglimit(struct uidinfo *uip, long *limit, int diff, rlim_t max, const char *name) { long new; /* Don't allow them to exceed max, but allow subtraction. */ new = atomic_fetchadd_long(limit, (long)diff) + diff; if (diff > 0 && max != 0) { if (new < 0 || new > max) { atomic_subtract_long(limit, (long)diff); return (0); } } else if (new < 0) printf("negative %s for uid = %d\n", name, uip->ui_uid); return (1); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_proccnt, diff, max, "proccnt")); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t max) { int diff, rv; diff = to - *hiwat; if (diff > 0 && max == 0) { rv = 0; } else { rv = chglimit(uip, &uip->ui_sbsize, diff, max, "sbsize"); if (rv != 0) *hiwat = to; } return (rv); } /* * Change the count associated with number of pseudo-terminals * a given user is using. When 'max' is 0, don't enforce a limit */ int chgptscnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_ptscnt, diff, max, "ptscnt")); } int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_kqcnt, diff, max, "kqcnt")); } int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_umtxcnt, diff, max, "umtxcnt")); } + +int +chgpipecnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt")); +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 4c6f010a03f6..dc8b9ca2c9af 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1,1868 +1,1884 @@ /*- * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; static struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; static long pipe_mindirect = PIPE_MINDIRECT; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, bool backing); static int pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static void pipe_timestamp(struct timespec *tsp); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr64 pipeino_unr; static dev_t pipedev_ino; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); new_unrhdr64(&pipeino_unr, 1); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS) { int error = 0; long tmp_pipe_mindirect = pipe_mindirect; error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req); if (error != 0 || req->newptr == NULL) return (error); /* * Don't allow pipe_mindirect to be set so low that we violate * atomicity requirements. */ if (tmp_pipe_mindirect <= PIPE_BUF) return (EINVAL); pipe_mindirect = tmp_pipe_mindirect; return (0); } SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW, &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L", "Minimum write size triggering VM optimization"); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); pipe_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static int pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; int error; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; + pp->pp_owner = crhold(td->td_ucred); knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* * Only the forward direction pipe is backed by big buffer by * default. */ error = pipe_create(rpipe, true); if (error != 0) goto fail; error = pipe_create(wpipe, false); if (error != 0) { /* * This cleanup leaves the pipe inode number for rpipe * still allocated, but never used. We do not free * inode numbers for opened pipes, which is required * for correctness because numbers must be unique. * But also it avoids any memory use by the unr * allocator, so stashing away the transient inode * number is reasonable. */ pipe_free_kmem(rpipe); goto fail; } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; return (0); fail: knlist_destroy(&rpipe->pipe_sel.si_note); knlist_destroy(&wpipe->pipe_sel.si_note); + crfree(pp->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, pp); return (error); } int pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; int error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED; *ppipe = &pp->pp_rpipe; return (0); } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } } /* * Get a timestamp. * * This used to be vfs_timestamp but the higher precision is unnecessary and * can very negatively affect performance in virtualized environments (e.g., on * vms running on amd64 when using the rdtscp instruction). */ static void pipe_timestamp(struct timespec *tsp) { getnanotime(tsp); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace_new(struct pipe *cpipe, int size) { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); + if (!chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, + size, lim_cur(curthread, RLIMIT_PIPEBUF))) { + if (cpipe->pipe_buffer.buffer == NULL && + size > SMALL_PIPE_SIZE) { + size = SMALL_PIPE_SIZE; + goto retry; + } + return (ENOMEM); + } + error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0); if (error != KERN_SUCCESS) { + chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0); if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(struct pipe *cpipe, int size) { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(struct pipe *cpipe, int catch) { int error, prio; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); prio = PRIBIO; if (catch) prio |= PCATCH; while (cpipe->pipe_state & PIPE_LOCKFL) { KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_waiters++; error = msleep(&cpipe->pipe_waiters, PIPE_MTX(cpipe), prio, "pipelk", 0); cpipe->pipe_waiters--; if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_waiters > 0) wakeup_one(&cpipe->pipe_waiters); } void pipeselwakeup(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(struct pipe *pipe, bool large_backing) { int error; error = pipespace_new(pipe, !large_backing || amountpipekva > maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE); if (error == 0) pipe->pipe_ino = alloc_unr64(&pipeino_unr); return (error); } /* ARGSUSED */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; /* * Try to avoid locking the pipe if we have nothing to do. * * There are programs which share one pipe amongst multiple processes * and perform non-blocking reads in parallel, even if the pipe is * empty. This in particular is the case with BSD make, which when * spawned with a high -j number can find itself with over half of the * calls failing to find anything. */ if ((fp->f_flag & FNONBLOCK) != 0 && !mac_pipe_check_read_enabled()) { if (__predict_false(uio->uio_resid == 0)) return (0); if ((atomic_load_short(&rpipe->pipe_state) & PIPE_EOF) == 0 && atomic_load_int(&rpipe->pipe_buffer.cnt) == 0 && atomic_load_int(&rpipe->pipe_pages.cnt) == 0) return (EAGAIN); } PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 && rpipe->pipe_buffer.size > SMALL_PIPE_SIZE && rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_pages.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_pages.ms, rpipe->pipe_pages.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_pages.pos += size; rpipe->pipe_pages.cnt -= size; if (rpipe->pipe_pages.cnt == 0) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) pipe_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } /* * Only wake up writers if there was actually something read. * Otherwise, when calling read(2) at EOF, a spurious wakeup occurs. */ if (nread > 0 && rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); if (nread > 0) td->td_ru.ru_msgrcv++; return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_pages.ms, PIPENPAGES); PIPE_LOCK(wpipe); if (i < 0) { wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); } wpipe->pipe_pages.npages = i; wpipe->pipe_pages.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_pages.cnt = size; uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; } uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * Unwire the process buffer. */ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages); wpipe->pipe_pages.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(struct pipe *wpipe) { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); size = wpipe->pipe_pages.cnt; pos = wpipe->pipe_pages.pos; wpipe->pipe_pages.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int pipe_direct_write(struct pipe *wpipe, struct uio *uio) { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } error = pipe_build_write_buffer(wpipe, uio); if (error) { goto error1; } while (wpipe->pipe_pages.cnt != 0 && (wpipe->pipe_state & PIPE_EOF) == 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); if (error != 0) break; } if ((wpipe->pipe_state & PIPE_EOF) != 0) { wpipe->pipe_pages.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); error = EPIPE; } else if (error == EINTR || error == ERESTART) { pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *wpipe, *rpipe; ssize_t orig_resid; int desiredsize, error; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if (amountpipekva > (3 * maxpipekva) / 4 && wpipe->pipe_buffer.size > SMALL_PIPE_SIZE && wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if (desiredsize != wpipe->pipe_buffer.size && (wpipe->pipe_state & PIPE_DIRECTW) == 0) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } MPASS(wpipe->pipe_buffer.size != 0); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= pipe_mindirect && wpipe->pipe_buffer.size >= pipe_mindirect && (fp->f_flag & FNONBLOCK) == 0) { error = pipe_direct_write(wpipe, uio); if (error != 0) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_pages.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } if (error != 0) break; continue; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. * EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) pipe_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); if (uio->uio_resid != orig_resid) td->td_ru.ru_msgsnd++; return (error); } /* ARGSUSED */ static int pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_pages.cnt != 0) *(int *)data = mpipe->pipe_pages.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents && fp->f_pipegen == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { if (fp->f_flag & FREAD) revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { /* * Add ourselves regardless of eventmask as we have to return * POLLHUP even if it was not asked for. */ if ((fp->f_flag & FREAD) != 0) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if ((fp->f_flag & FWRITE) != 0 && wpipe->pipe_present == PIPE_ACTIVE) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct pipe *pipe; #ifdef MAC int error; #endif pipe = fp->f_data; #ifdef MAC if (mac_pipe_check_stat_enabled()) { PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) { return (error); } } #endif /* For named pipes ask the underlying filesystem. */ if (pipe->pipe_type & PIPE_TYPE_NAMED) { return (vnops.fo_stat(fp, ub, active_cred)); } bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_pages.cnt != 0) ub->st_size = pipe->pipe_pages.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. */ return (0); } /* ARGSUSED */ static int pipe_close(struct file *fp, struct thread *td) { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; kif->kf_un.kf_pipe.kf_pipe_buffer_in = pi->pipe_buffer.in; kif->kf_un.kf_pipe.kf_pipe_buffer_out = pi->pipe_buffer.out; kif->kf_un.kf_pipe.kf_pipe_buffer_size = pi->pipe_buffer.size; return (0); } static void pipe_free_kmem(struct pipe *cpipe) { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); + chgpipecnt(cpipe->pipe_pair->pp_owner->cr_uidinfo, + -cpipe->pipe_buffer.size, 0); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_pages.cnt = 0; cpipe->pipe_pages.pos = 0; cpipe->pipe_pages.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(struct pipe *cpipe) { #ifdef MAC struct pipepair *pp; #endif struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); #ifdef MAC pp = cpipe->pipe_pair; #endif /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } pipeselwakeup(cpipe); /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); pipeselwakeup(ppipe); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); + crfree(cpipe->pipe_pair->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: if ((cpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_UNLOCK(cpipe); return (vnops.fo_kqfilter(fp, kn)); } PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct pipe *rpipe = kn->kn_hook; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_pages.cnt; if ((rpipe->pipe_state & PIPE_EOF) != 0 && ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 || fp->f_pipegen != rpipe->pipe_wgen)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe = kn->kn_hook; /* * If this end of the pipe is closed, the knote was removed from the * knlist and the list lock (i.e., the pipe lock) is therefore not held. */ if (wpipe->pipe_present == PIPE_ACTIVE || (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_state & PIPE_DIRECTW) { kn->kn_data = 0; } else if (wpipe->pipe_buffer.size > 0) { kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; } else { kn->kn_data = PIPE_BUF; } } if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } diff --git a/sys/sys/pipe.h b/sys/sys/pipe.h index 61fb57480605..0f35316432eb 100644 --- a/sys/sys/pipe.h +++ b/sys/sys/pipe.h @@ -1,151 +1,152 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. */ #ifndef _SYS_PIPE_H_ #define _SYS_PIPE_H_ /* * Pipe buffer size, keep moderate in value, pipes take kva space. */ #ifndef PIPE_SIZE #define PIPE_SIZE 16384 #endif #ifndef BIG_PIPE_SIZE #define BIG_PIPE_SIZE (64*1024) #endif #ifndef SMALL_PIPE_SIZE #define SMALL_PIPE_SIZE PAGE_SIZE #endif /* * PIPE_MINDIRECT MUST be smaller than PIPE_SIZE and MUST be bigger * than PIPE_BUF. */ #ifndef PIPE_MINDIRECT #define PIPE_MINDIRECT 8192 #endif #define PIPENPAGES (BIG_PIPE_SIZE / PAGE_SIZE + 1) #ifdef _KERNEL /* * See sys_pipe.c for info on what these limits mean. */ extern long maxpipekva; extern struct fileops pipeops; #endif /* * Pipe buffer information. * Separate in, out, cnt are used to simplify calculations. * Buffered write is active when the buffer.cnt field is set. */ struct pipebuf { u_int cnt; /* number of chars currently in buffer */ u_int in; /* in pointer */ u_int out; /* out pointer */ u_int size; /* size of buffer */ caddr_t buffer; /* kva of buffer */ }; /* * Information to support direct transfers between processes for pipes. */ struct pipemapping { u_int cnt; /* number of chars in buffer */ u_int pos; /* current position of transfer */ int npages; /* number of pages */ vm_page_t ms[PIPENPAGES]; /* pages in source process */ }; /* * Bits in pipe_state. */ #define PIPE_ASYNC 0x004 /* Async? I/O. */ #define PIPE_WANTR 0x008 /* Reader wants some characters. */ #define PIPE_WANTW 0x010 /* Writer wants space to put characters. */ #define PIPE_WANT 0x020 /* Pipe is wanted to be run-down. */ #define PIPE_SEL 0x040 /* Pipe has a select active. */ #define PIPE_EOF 0x080 /* Pipe is in EOF condition. */ #define PIPE_LOCKFL 0x100 /* Process has exclusive access to pointers/data. */ #define PIPE_LWANT 0x200 /* Process wants exclusive access to pointers/data. */ #define PIPE_DIRECTW 0x400 /* Pipe direct write active. */ #define PIPE_DIRECTOK 0x800 /* Direct mode ok. */ /* * Bits in pipe_type. */ #define PIPE_TYPE_NAMED 0x001 /* Is a named pipe. */ /* * Per-pipe data structure. * Two of these are linked together to produce bi-directional pipes. */ struct pipe { struct pipebuf pipe_buffer; /* data storage */ struct pipemapping pipe_pages; /* wired pages for direct I/O */ struct selinfo pipe_sel; /* for compat with select */ struct timespec pipe_atime; /* time of last access */ struct timespec pipe_mtime; /* time of last modify */ struct timespec pipe_ctime; /* time of status change */ struct sigio *pipe_sigio; /* information for async I/O */ struct pipe *pipe_peer; /* link with other direction */ struct pipepair *pipe_pair; /* container structure pointer */ u_short pipe_state; /* pipe status info */ u_char pipe_type; /* pipe type info */ u_char pipe_present; /* still present? */ int pipe_waiters; /* pipelock waiters */ int pipe_busy; /* busy flag, mostly to handle rundown sanely */ int pipe_wgen; /* writer generation for named pipe */ ino_t pipe_ino; /* fake inode for stat(2) */ }; /* * Values for the pipe_present. */ #define PIPE_ACTIVE 1 #define PIPE_CLOSING 2 #define PIPE_FINALIZED 3 /* * Container structure to hold the two pipe endpoints, mutex, and label * pointer. */ struct pipepair { struct pipe pp_rpipe; struct pipe pp_wpipe; struct mtx pp_mtx; struct label *pp_label; + struct ucred *pp_owner; /* to dec pipe usage count */ }; #define PIPE_MTX(pipe) (&(pipe)->pipe_pair->pp_mtx) #define PIPE_LOCK(pipe) mtx_lock(PIPE_MTX(pipe)) #define PIPE_UNLOCK(pipe) mtx_unlock(PIPE_MTX(pipe)) #define PIPE_LOCK_ASSERT(pipe, type) mtx_assert(PIPE_MTX(pipe), (type)) #ifdef _KERNEL void pipe_dtor(struct pipe *dpipe); int pipe_named_ctor(struct pipe **ppipe, struct thread *td); void pipeselwakeup(struct pipe *cpipe); #endif #endif /* !_SYS_PIPE_H_ */ diff --git a/sys/sys/resource.h b/sys/sys/resource.h index 5100501fb2b8..3b582e83d553 100644 --- a/sys/sys/resource.h +++ b/sys/sys/resource.h @@ -1,193 +1,194 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resource.h 8.4 (Berkeley) 1/9/95 */ #ifndef _SYS_RESOURCE_H_ #define _SYS_RESOURCE_H_ #include #include #include #ifndef _ID_T_DECLARED typedef __id_t id_t; #define _ID_T_DECLARED #endif #ifndef _RLIM_T_DECLARED typedef __rlim_t rlim_t; #define _RLIM_T_DECLARED #endif /* * Process priority specifications to get/setpriority. */ #define PRIO_MIN -20 #define PRIO_MAX 20 #define PRIO_PROCESS 0 #define PRIO_PGRP 1 #define PRIO_USER 2 /* * Resource utilization information. * * All fields are only modified by curthread and * no locks are required to read. */ #define RUSAGE_SELF 0 #define RUSAGE_CHILDREN -1 #define RUSAGE_THREAD 1 struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ long ru_maxrss; /* max resident set size */ #define ru_first ru_ixrss long ru_ixrss; /* integral shared memory size */ long ru_idrss; /* integral unshared data " */ long ru_isrss; /* integral unshared stack " */ long ru_minflt; /* page reclaims */ long ru_majflt; /* page faults */ long ru_nswap; /* swaps */ long ru_inblock; /* block input operations */ long ru_oublock; /* block output operations */ long ru_msgsnd; /* messages sent */ long ru_msgrcv; /* messages received */ long ru_nsignals; /* signals received */ long ru_nvcsw; /* voluntary context switches */ long ru_nivcsw; /* involuntary " */ #define ru_last ru_nivcsw }; #if __BSD_VISIBLE struct __wrusage { struct rusage wru_self; struct rusage wru_children; }; #endif /* * Resource limits */ #define RLIMIT_CPU 0 /* maximum cpu time in seconds */ #define RLIMIT_FSIZE 1 /* maximum file size */ #define RLIMIT_DATA 2 /* data size */ #define RLIMIT_STACK 3 /* stack size */ #define RLIMIT_CORE 4 /* core file size */ #define RLIMIT_RSS 5 /* resident set size */ #define RLIMIT_MEMLOCK 6 /* locked-in-memory address space */ #define RLIMIT_NPROC 7 /* number of processes */ #define RLIMIT_NOFILE 8 /* number of open files */ #define RLIMIT_SBSIZE 9 /* maximum size of all socket buffers */ #define RLIMIT_VMEM 10 /* virtual process size (incl. mmap) */ #define RLIMIT_AS RLIMIT_VMEM /* standard name for RLIMIT_VMEM */ #define RLIMIT_NPTS 11 /* pseudo-terminals */ #define RLIMIT_SWAP 12 /* swap used */ #define RLIMIT_KQUEUES 13 /* kqueues allocated */ #define RLIMIT_UMTXP 14 /* process-shared umtx */ +#define RLIMIT_PIPEBUF 15 /* pipes/fifos buffers */ -#define RLIM_NLIMITS 15 /* number of resource limits */ +#define RLIM_NLIMITS 16 /* number of resource limits */ #define RLIM_INFINITY ((rlim_t)(((__uint64_t)1 << 63) - 1)) #define RLIM_SAVED_MAX RLIM_INFINITY #define RLIM_SAVED_CUR RLIM_INFINITY /* * Resource limit string identifiers */ #ifdef _RLIMIT_IDENT static const char *rlimit_ident[RLIM_NLIMITS] = { "cpu", "fsize", "data", "stack", "core", "rss", "memlock", "nproc", "nofile", "sbsize", "vmem", "npts", "swap", "kqueues", "umtx", }; #endif struct rlimit { rlim_t rlim_cur; /* current (soft) limit */ rlim_t rlim_max; /* maximum value for rlim_cur */ }; #if __BSD_VISIBLE struct orlimit { __int32_t rlim_cur; /* current (soft) limit */ __int32_t rlim_max; /* maximum value for rlim_cur */ }; struct loadavg { __fixpt_t ldavg[3]; long fscale; }; #define CP_USER 0 #define CP_NICE 1 #define CP_SYS 2 #define CP_INTR 3 #define CP_IDLE 4 #define CPUSTATES 5 #endif /* __BSD_VISIBLE */ #ifdef _KERNEL extern struct loadavg averunnable; void read_cpu_time(long *cp_time); /* Writes array of CPUSTATES */ #else __BEGIN_DECLS /* XXX 2nd arg to [gs]etpriority() should be an id_t */ int getpriority(int, int); int getrlimit(int, struct rlimit *); int getrusage(int, struct rusage *); int setpriority(int, int, int); int setrlimit(int, const struct rlimit *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_RESOURCE_H_ */ diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h index 49119651665e..8449201dee67 100644 --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -1,199 +1,201 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resourcevar.h 8.4 (Berkeley) 1/9/95 */ #ifndef _SYS_RESOURCEVAR_H_ #define _SYS_RESOURCEVAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Kernel per-process accounting / statistics * (not necessarily resident except when running). * * Locking key: * b - created at fork, never changes * c - locked by proc mtx * k - only accessed by curthread * w - locked by proc itim lock * w2 - locked by proc prof lock */ struct pstats { #define pstat_startzero p_cru struct rusage p_cru; /* Stats for reaped children. */ struct itimerval p_timer[3]; /* (w) Virtual-time timers. */ #define pstat_endzero pstat_startcopy #define pstat_startcopy p_prof struct uprof { /* Profile arguments. */ caddr_t pr_base; /* (c + w2) Buffer base. */ u_long pr_size; /* (c + w2) Buffer size. */ u_long pr_off; /* (c + w2) PC offset. */ u_long pr_scale; /* (c + w2) PC scaling. */ } p_prof; #define pstat_endcopy p_start struct timeval p_start; /* (b) Starting time. */ }; #ifdef _KERNEL /* * Kernel shareable process resource limits. Because this structure * is moderately large but changes infrequently, it is normally * shared copy-on-write after forks. */ struct plimit { struct rlimit pl_rlimit[RLIM_NLIMITS]; int pl_refcnt; /* number of references */ }; struct limbatch { struct plimit *limp; int count; }; static inline void limbatch_prep(struct limbatch *lb) { lb->limp = NULL; lb->count = 0; } void limbatch_add(struct limbatch *lb, struct thread *td); static inline void limbatch_process(struct limbatch *lb __unused) { } void limbatch_final(struct limbatch *lb); struct racct; /*- * Per uid resource consumption. This structure is used to track * the total resource consumption (process count, socket buffer size, * etc) for the uid and impose limits. * * Locking guide: * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ u_long ui_vmsize; /* (b) pages of swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ long ui_kqcnt; /* (b) number of kqueues */ long ui_umtxcnt; /* (b) number of shared umtxs */ + long ui_pipecnt; /* (b) consumption of pipe buffers */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT struct racct *ui_racct; /* (a) resource accounting */ #endif }; struct proc; struct rusage_ext; struct thread; void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks); void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks); void calccru(struct proc *p, struct timeval *up, struct timeval *sp); void calcru(struct proc *p, struct timeval *up, struct timeval *sp); int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max); int chgproccnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t maxval); int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval); +int chgpipecnt(struct uidinfo *uip, int diff, rlim_t max); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit *lim_alloc(void); void lim_copy(struct plimit *dst, struct plimit *src); rlim_t lim_cur(struct thread *td, int which); #define lim_cur(td, which) ({ \ rlim_t _rlim; \ struct thread *_td = (td); \ int _which = (which); \ if (__builtin_constant_p(which) && which != RLIMIT_DATA && \ which != RLIMIT_STACK && which != RLIMIT_VMEM) { \ _rlim = _td->td_limit->pl_rlimit[_which].rlim_cur; \ } else { \ _rlim = lim_cur(_td, _which); \ } \ _rlim; \ }) rlim_t lim_cur_proc(struct proc *p, int which); void lim_fork(struct proc *p1, struct proc *p2); void lim_free(struct plimit *limp); void lim_freen(struct plimit *limp, int n); struct plimit *lim_hold(struct plimit *limp); struct plimit *lim_cowsync(void); rlim_t lim_max(struct thread *td, int which); rlim_t lim_max_proc(struct proc *p, int which); void lim_rlimit(struct thread *td, int which, struct rlimit *rlp); void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp); void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2); void rucollect(struct rusage *ru, struct rusage *ru2); void rufetch(struct proc *p, struct rusage *ru); void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp); void rufetchtd(struct thread *td, struct rusage *ru); void ruxagg(struct proc *p, struct thread *td); void ruxagg_locked(struct proc *p, struct thread *td); struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3); #endif #endif /* _KERNEL */ #endif /* !_SYS_RESOURCEVAR_H_ */