diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c index 877a25ffa4cd..6ace3ea52c64 100644 --- a/sys/kern/kern_ktrace.c +++ b/sys/kern/kern_ktrace.c @@ -1,1397 +1,1397 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ktrace facility allows the tracing of certain key events in user space * processes, such as system calls, signal delivery, context switches, and * user generated events using utrace(2). It works by streaming event * records and data to a vnode associated with the process using the * ktrace(2) system call. In general, records can be written directly from * the context that generates the event. One important exception to this is * during a context switch, where sleeping is not permitted. To handle this * case, trace events are generated using in-kernel ktr_request records, and * then delivered to disk at a convenient moment -- either immediately, the * next traceable event, at system call return, or at process exit. * * When dealing with multiple threads or processes writing to the same event * log, ordering guarantees are weak: specifically, if an event has multiple * records (i.e., system call enter and return), they may be interlaced with * records from another event. Process and thread ID information is provided * in the record, and user applications can de-interlace events if required. 
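 *
 * As a minimal, hypothetical sketch of such de-interlacing (the helper
 * route_record() is illustrative, not part of this interface), a
 * userspace consumer could key each record on the pid/tid carried in
 * its header and skip ktr_len payload bytes to reach the next one:
 *
 *	struct ktr_header kth;
 *	while (fread(&kth, sizeof(kth), 1, fp) == 1) {
 *		route_record(kth.ktr_pid, kth.ktr_tid, &kth);
 *		fseek(fp, kth.ktr_len, SEEK_CUR);
 *	}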
*/ static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); #ifdef KTRACE FEATURE(ktrace, "Kernel support for system-call tracing"); #ifndef KTRACE_REQUEST_POOL #define KTRACE_REQUEST_POOL 100 #endif struct ktr_request { struct ktr_header ktr_header; void *ktr_buffer; union { struct ktr_proc_ctor ktr_proc_ctor; struct ktr_cap_fail ktr_cap_fail; struct ktr_syscall ktr_syscall; struct ktr_sysret ktr_sysret; struct ktr_genio ktr_genio; struct ktr_psig ktr_psig; struct ktr_csw ktr_csw; struct ktr_fault ktr_fault; struct ktr_faultend ktr_faultend; struct ktr_struct_array ktr_struct_array; } ktr_data; STAILQ_ENTRY(ktr_request) ktr_list; }; static const int data_lengths[] = { [KTR_SYSCALL] = offsetof(struct ktr_syscall, ktr_args), [KTR_SYSRET] = sizeof(struct ktr_sysret), [KTR_NAMEI] = 0, [KTR_GENIO] = sizeof(struct ktr_genio), [KTR_PSIG] = sizeof(struct ktr_psig), [KTR_CSW] = sizeof(struct ktr_csw), [KTR_USER] = 0, [KTR_STRUCT] = 0, [KTR_SYSCTL] = 0, [KTR_PROCCTOR] = sizeof(struct ktr_proc_ctor), [KTR_PROCDTOR] = 0, [KTR_CAPFAIL] = sizeof(struct ktr_cap_fail), [KTR_FAULT] = sizeof(struct ktr_fault), [KTR_FAULTEND] = sizeof(struct ktr_faultend), [KTR_STRUCT_ARRAY] = sizeof(struct ktr_struct_array), }; static STAILQ_HEAD(, ktr_request) ktr_free; static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "KTRACE options"); static u_int ktr_requestpool = KTRACE_REQUEST_POOL; TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool); u_int ktr_geniosize = PAGE_SIZE; SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RWTUN, &ktr_geniosize, 0, "Maximum size of genio event payload"); /* * Allow suppressing the signal to the traced process, in whose context * the ktr record is written. The limit is applied from the process that * set up the ktrace, so killing the traced process for exceeding it is * not completely fair. */ int ktr_filesize_limit_signal = 0; SYSCTL_INT(_kern_ktrace, OID_AUTO, filesize_limit_signal, CTLFLAG_RWTUN, &ktr_filesize_limit_signal, 0, "Send SIGXFSZ to the traced process when the log size limit is exceeded"); static int print_message = 1; static struct mtx ktrace_mtx; static struct sx ktrace_sx; struct ktr_io_params { struct vnode *vp; struct ucred *cr; off_t lim; u_int refs; }; static void ktrace_init(void *dummy); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int oldsize, u_int newsize); static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type); static struct ktr_request *ktr_getrequest(int type); static void ktr_submitrequest(struct thread *td, struct ktr_request *req); static struct ktr_io_params *ktr_freeproc(struct proc *p); static void ktr_freerequest(struct ktr_request *req); static void ktr_freerequest_locked(struct ktr_request *req); static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *, struct proc *, int, int, struct ktr_io_params *); static int ktrops(struct thread *, struct proc *, int, int, struct ktr_io_params *); static void ktrprocctor_entered(struct thread *, struct proc *); /* * ktrace itself generates events, such as context switches, which we do not * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine * whether or not it is in a region where tracing of events should be * suppressed.
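 *
 * Without the flag, writing a record could recurse: committing a record
 * may sleep, sleeping context-switches, and the switch would in turn
 * generate a KTR_CSW event. The generation points below therefore bail
 * out early while the flag is set, e.g.:
 *
 *	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 *		return;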
*/ static void ktrace_enter(struct thread *td) { KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set")); td->td_pflags |= TDP_INKTRACE; } static void ktrace_exit(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set")); td->td_pflags &= ~TDP_INKTRACE; } static void ktrace_assert(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set")); } static void ast_ktrace(struct thread *td, int tda __unused) { KTRUSERRET(td); } static void ktrace_init(void *dummy) { struct ktr_request *req; int i; mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); sx_init(&ktrace_sx, "ktrace_sx"); STAILQ_INIT(&ktr_free); for (i = 0; i < ktr_requestpool; i++) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK | M_ZERO); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } ast_register(TDA_KTRACE, ASTR_ASTF_REQUIRED, 0, ast_ktrace); } SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) { struct thread *td; u_int newsize, oldsize, wantsize; int error; /* Handle easy read-only case first to avoid warnings from GCC. */ if (!req->newptr) { oldsize = ktr_requestpool; return (SYSCTL_OUT(req, &oldsize, sizeof(u_int))); } error = SYSCTL_IN(req, &wantsize, sizeof(u_int)); if (error) return (error); td = curthread; ktrace_enter(td); oldsize = ktr_requestpool; newsize = ktrace_resize_pool(oldsize, wantsize); ktrace_exit(td); error = SYSCTL_OUT(req, &oldsize, sizeof(u_int)); if (error) return (error); if (wantsize > oldsize && newsize < wantsize) return (ENOSPC); return (0); } SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "Pool buffer size for ktrace(1)"); static u_int ktrace_resize_pool(u_int oldsize, u_int newsize) { STAILQ_HEAD(, ktr_request) ktr_new; struct ktr_request *req; int bound; print_message = 1; bound = newsize - oldsize; if (bound == 0) return (ktr_requestpool); if (bound < 0) { mtx_lock(&ktrace_mtx); /* Shrink pool down to newsize if possible. */ while (bound++ < 0) { req = STAILQ_FIRST(&ktr_free); if (req == NULL) break; STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); ktr_requestpool--; free(req, M_KTRACE); } } else { /* Grow pool up to newsize. */ STAILQ_INIT(&ktr_new); while (bound-- > 0) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK | M_ZERO); STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list); } mtx_lock(&ktrace_mtx); STAILQ_CONCAT(&ktr_free, &ktr_new); ktr_requestpool += (newsize - oldsize); } mtx_unlock(&ktrace_mtx); return (ktr_requestpool); } /* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. 
*/ CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) == (sizeof((struct thread *)NULL)->td_name)); static struct ktr_request * ktr_getrequest_entered(struct thread *td, int type) { struct ktr_request *req; struct proc *p = td->td_proc; int pm; mtx_lock(&ktrace_mtx); if (!KTRCHECK(td, type)) { mtx_unlock(&ktrace_mtx); return (NULL); } req = STAILQ_FIRST(&ktr_free); if (req != NULL) { STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); req->ktr_header.ktr_type = type; if (p->p_traceflag & KTRFAC_DROP) { req->ktr_header.ktr_type |= KTR_DROP; p->p_traceflag &= ~KTRFAC_DROP; } mtx_unlock(&ktrace_mtx); nanotime(&req->ktr_header.ktr_time); req->ktr_header.ktr_type |= KTR_VERSIONED; req->ktr_header.ktr_pid = p->p_pid; req->ktr_header.ktr_tid = td->td_tid; req->ktr_header.ktr_cpu = PCPU_GET(cpuid); req->ktr_header.ktr_version = KTR_VERSION1; bcopy(td->td_name, req->ktr_header.ktr_comm, sizeof(req->ktr_header.ktr_comm)); req->ktr_buffer = NULL; req->ktr_header.ktr_len = 0; } else { p->p_traceflag |= KTRFAC_DROP; pm = print_message; print_message = 0; mtx_unlock(&ktrace_mtx); if (pm) printf("Out of ktrace request objects.\n"); } return (req); } static struct ktr_request * ktr_getrequest(int type) { struct thread *td = curthread; struct ktr_request *req; ktrace_enter(td); req = ktr_getrequest_entered(td, type); if (req == NULL) ktrace_exit(td); return (req); } /* * Some trace generation environments don't permit direct access to VFS, * such as during a context switch where sleeping is not allowed. Under these * circumstances, queue a request to the thread to be written asynchronously * later. */ static void ktr_enqueuerequest(struct thread *td, struct ktr_request *req) { mtx_lock(&ktrace_mtx); STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list); mtx_unlock(&ktrace_mtx); ast_sched(td, TDA_KTRACE); } /* * Drain any pending ktrace records from the per-thread queue to disk. This * is used both internally before committing other records, and also on * system call return. We drain all the ones we can find at the time when * drain is requested, but don't keep draining after that as those events * may be approximately "after" the current event. */ static void ktr_drain(struct thread *td) { struct ktr_request *queued_req; STAILQ_HEAD(, ktr_request) local_queue; ktrace_assert(td); sx_assert(&ktrace_sx, SX_XLOCKED); STAILQ_INIT(&local_queue); if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) { mtx_lock(&ktrace_mtx); STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr); mtx_unlock(&ktrace_mtx); while ((queued_req = STAILQ_FIRST(&local_queue))) { STAILQ_REMOVE_HEAD(&local_queue, ktr_list); ktr_writerequest(td, queued_req); ktr_freerequest(queued_req); } } } /* * Submit a trace record for immediate commit to disk -- to be used only * where entering VFS is OK. First drain any pending records that may have * been cached in the thread. 
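 *
 * The asynchronous variant reaches the same drain on the way back to
 * usermode; schematically (a sketch, assuming the KTRUSERRET() macro
 * dispatches to ktruserret()):
 *
 *	ktr_enqueuerequest() -> ast_sched(td, TDA_KTRACE)
 *	    ... return to usermode ...
 *	ast_ktrace() -> KTRUSERRET(td) -> ktruserret() -> ktr_drain(td)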
*/ static void ktr_submitrequest(struct thread *td, struct ktr_request *req) { ktrace_assert(td); sx_xlock(&ktrace_sx); ktr_drain(td); ktr_writerequest(td, req); ktr_freerequest(req); sx_xunlock(&ktrace_sx); ktrace_exit(td); } static void ktr_freerequest(struct ktr_request *req) { mtx_lock(&ktrace_mtx); ktr_freerequest_locked(req); mtx_unlock(&ktrace_mtx); } static void ktr_freerequest_locked(struct ktr_request *req) { mtx_assert(&ktrace_mtx, MA_OWNED); if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } static void ktr_io_params_ref(struct ktr_io_params *kiop) { mtx_assert(&ktrace_mtx, MA_OWNED); kiop->refs++; } static struct ktr_io_params * ktr_io_params_rele(struct ktr_io_params *kiop) { mtx_assert(&ktrace_mtx, MA_OWNED); if (kiop == NULL) return (NULL); KASSERT(kiop->refs > 0, ("kiop ref == 0 %p", kiop)); return (--(kiop->refs) == 0 ? kiop : NULL); } void ktr_io_params_free(struct ktr_io_params *kiop) { if (kiop == NULL) return; MPASS(kiop->refs == 0); vn_close(kiop->vp, FWRITE, kiop->cr, curthread); crfree(kiop->cr); free(kiop, M_KTRACE); } static struct ktr_io_params * ktr_io_params_alloc(struct thread *td, struct vnode *vp) { struct ktr_io_params *res; res = malloc(sizeof(struct ktr_io_params), M_KTRACE, M_WAITOK); res->vp = vp; res->cr = crhold(td->td_ucred); res->lim = lim_cur(td, RLIMIT_FSIZE); res->refs = 1; return (res); } /* * Disable tracing for a process and release all associated resources. * The caller is responsible for releasing a reference on the returned * vnode and credentials. */ static struct ktr_io_params * ktr_freeproc(struct proc *p) { struct ktr_io_params *kiop; struct ktr_request *req; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ktrace_mtx, MA_OWNED); kiop = ktr_io_params_rele(p->p_ktrioparms); p->p_ktrioparms = NULL; p->p_traceflag = 0; while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) { STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list); ktr_freerequest_locked(req); } return (kiop); } struct vnode * ktr_get_tracevp(struct proc *p, bool ref) { struct vnode *vp; PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_ktrioparms != NULL) { vp = p->p_ktrioparms->vp; if (ref) vrefact(vp); } else { vp = NULL; } return (vp); } void ktrsyscall(int code, int narg, syscallarg_t args[]) { struct ktr_request *req; struct ktr_syscall *ktp; size_t buflen; char *buf = NULL; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; buflen = sizeof(register_t) * narg; if (buflen > 0) { buf = malloc(buflen, M_KTRACE, M_WAITOK); bcopy(args, buf, buflen); } req = ktr_getrequest(KTR_SYSCALL); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } ktp = &req->ktr_data.ktr_syscall; ktp->ktr_code = code; ktp->ktr_narg = narg; if (buflen > 0) { req->ktr_header.ktr_len = buflen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrsysret(int code, int error, register_t retval) { struct ktr_request *req; struct ktr_sysret *ktp; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; req = ktr_getrequest(KTR_SYSRET); if (req == NULL) return; ktp = &req->ktr_data.ktr_sysret; ktp->ktr_code = code; ktp->ktr_error = error; ktp->ktr_retval = ((error == 0) ? retval: 0); /* what about val2 ? */ ktr_submitrequest(curthread, req); } /* * When a setuid process execs, disable tracing. * * XXX: We toss any pending asynchronous records. 
*/ struct ktr_io_params * ktrprocexec(struct proc *p) { struct ktr_io_params *kiop; PROC_LOCK_ASSERT(p, MA_OWNED); kiop = p->p_ktrioparms; if (kiop == NULL || priv_check_cred(kiop->cr, PRIV_DEBUG_DIFFCRED)) return (NULL); mtx_lock(&ktrace_mtx); kiop = ktr_freeproc(p); mtx_unlock(&ktrace_mtx); return (kiop); } /* * When a process exits, drain per-process asynchronous trace records * and disable tracing. */ void ktrprocexit(struct thread *td) { struct ktr_request *req; struct proc *p; struct ktr_io_params *kiop; p = td->td_proc; if (p->p_traceflag == 0) return; ktrace_enter(td); req = ktr_getrequest_entered(td, KTR_PROCDTOR); if (req != NULL) ktr_enqueuerequest(td, req); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); PROC_LOCK(p); mtx_lock(&ktrace_mtx); kiop = ktr_freeproc(p); mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); ktr_io_params_free(kiop); ktrace_exit(td); } static void ktrprocctor_entered(struct thread *td, struct proc *p) { struct ktr_proc_ctor *ktp; struct ktr_request *req; struct thread *td2; ktrace_assert(td); td2 = FIRST_THREAD_IN_PROC(p); req = ktr_getrequest_entered(td2, KTR_PROCCTOR); if (req == NULL) return; ktp = &req->ktr_data.ktr_proc_ctor; ktp->sv_flags = p->p_sysent->sv_flags; ktr_enqueuerequest(td2, req); } void ktrprocctor(struct proc *p) { struct thread *td = curthread; if ((p->p_traceflag & KTRFAC_MASK) == 0) return; ktrace_enter(td); ktrprocctor_entered(td, p); ktrace_exit(td); } /* * When a process forks, enable tracing in the new process if needed. */ void ktrprocfork(struct proc *p1, struct proc *p2) { MPASS(p2->p_ktrioparms == NULL); MPASS(p2->p_traceflag == 0); if (p1->p_traceflag == 0) return; PROC_LOCK(p1); mtx_lock(&ktrace_mtx); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_ktrioparms = p1->p_ktrioparms) != NULL) p1->p_ktrioparms->refs++; } mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p1); ktrprocctor(p2); } /* * When a thread returns, drain any asynchronous records generated by the * system call. */ void ktruserret(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } void ktrnamei(const char *path) { struct ktr_request *req; int namelen; char *buf = NULL; namelen = strlen(path); if (namelen > 0) { buf = malloc(namelen, M_KTRACE, M_WAITOK); bcopy(path, buf, namelen); } req = ktr_getrequest(KTR_NAMEI); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } if (namelen > 0) { req->ktr_header.ktr_len = namelen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrsysctl(int *name, u_int namelen) { struct ktr_request *req; u_int mib[CTL_MAXNAME + 2]; char *mibname; size_t mibnamelen; int error; /* Lookup name of mib. 
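 * The { 0, 1 } prefix built below addresses the sysctl tree's internal
 * OID-to-name node, so kernel_sysctl() returns the dotted name of the
 * OID being traced; conceptually:
 *
 *	{ 0, 1, CTL_KERN, KERN_OSTYPE }  ->  "kern.ostype"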
*/ KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long")); mib[0] = 0; mib[1] = 1; bcopy(name, mib + 2, namelen * sizeof(*name)); mibnamelen = 128; mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK); error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen, NULL, 0, &mibnamelen, 0); if (error) { free(mibname, M_KTRACE); return; } req = ktr_getrequest(KTR_SYSCTL); if (req == NULL) { free(mibname, M_KTRACE); return; } req->ktr_header.ktr_len = mibnamelen; req->ktr_buffer = mibname; ktr_submitrequest(curthread, req); } void ktrgenio(int fd, enum uio_rw rw, struct uio *uio, int error) { struct ktr_request *req; struct ktr_genio *ktg; int datalen; char *buf; - if (error) { + if (error != 0 && (rw == UIO_READ || error == EFAULT)) { freeuio(uio); return; } uio->uio_offset = 0; uio->uio_rw = UIO_WRITE; datalen = MIN(uio->uio_resid, ktr_geniosize); buf = malloc(datalen, M_KTRACE, M_WAITOK); error = uiomove(buf, datalen, uio); freeuio(uio); if (error) { free(buf, M_KTRACE); return; } req = ktr_getrequest(KTR_GENIO); if (req == NULL) { free(buf, M_KTRACE); return; } ktg = &req->ktr_data.ktr_genio; ktg->ktr_fd = fd; ktg->ktr_rw = rw; req->ktr_header.ktr_len = datalen; req->ktr_buffer = buf; ktr_submitrequest(curthread, req); } void ktrpsig(int sig, sig_t action, sigset_t *mask, int code) { struct thread *td = curthread; struct ktr_request *req; struct ktr_psig *kp; req = ktr_getrequest(KTR_PSIG); if (req == NULL) return; kp = &req->ktr_data.ktr_psig; kp->signo = (char)sig; kp->action = action; kp->mask = *mask; kp->code = code; ktr_enqueuerequest(td, req); ktrace_exit(td); } void ktrcsw(int out, int user, const char *wmesg) { struct thread *td = curthread; struct ktr_request *req; struct ktr_csw *kc; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; req = ktr_getrequest(KTR_CSW); if (req == NULL) return; kc = &req->ktr_data.ktr_csw; kc->out = out; kc->user = user; if (wmesg != NULL) strlcpy(kc->wmesg, wmesg, sizeof(kc->wmesg)); else bzero(kc->wmesg, sizeof(kc->wmesg)); ktr_enqueuerequest(td, req); ktrace_exit(td); } void ktrstruct(const char *name, const void *data, size_t datalen) { struct ktr_request *req; char *buf; size_t buflen, namelen; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; if (data == NULL) datalen = 0; namelen = strlen(name) + 1; buflen = namelen + datalen; buf = malloc(buflen, M_KTRACE, M_WAITOK); strcpy(buf, name); bcopy(data, buf + namelen, datalen); if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) { free(buf, M_KTRACE); return; } req->ktr_buffer = buf; req->ktr_header.ktr_len = buflen; ktr_submitrequest(curthread, req); } void ktrstruct_error(const char *name, const void *data, size_t datalen, int error) { if (error == 0) ktrstruct(name, data, datalen); } void ktrstructarray(const char *name, enum uio_seg seg, const void *data, int num_items, size_t struct_size) { struct ktr_request *req; struct ktr_struct_array *ksa; char *buf; size_t buflen, datalen, namelen; int max_items; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; if (num_items < 0) return; /* Trim array length to genio size. 
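 * For example, with the default ktr_geniosize of PAGE_SIZE (4096 bytes
 * on most platforms), a 64-byte element gives max_items = 4096 / 64 =
 * 64 and longer arrays are truncated; if a single element already
 * exceeds ktr_geniosize, max_items computes to 0 and one element is
 * still recorded.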
*/ max_items = ktr_geniosize / struct_size; if (num_items > max_items) { if (max_items == 0) num_items = 1; else num_items = max_items; } datalen = num_items * struct_size; if (data == NULL) datalen = 0; namelen = strlen(name) + 1; buflen = namelen + datalen; buf = malloc(buflen, M_KTRACE, M_WAITOK); strcpy(buf, name); if (seg == UIO_SYSSPACE) bcopy(data, buf + namelen, datalen); else { if (copyin(data, buf + namelen, datalen) != 0) { free(buf, M_KTRACE); return; } } if ((req = ktr_getrequest(KTR_STRUCT_ARRAY)) == NULL) { free(buf, M_KTRACE); return; } ksa = &req->ktr_data.ktr_struct_array; ksa->struct_size = struct_size; req->ktr_buffer = buf; req->ktr_header.ktr_len = buflen; ktr_submitrequest(curthread, req); } void ktrcapfail(enum ktr_cap_fail_type type, const cap_rights_t *needed, const cap_rights_t *held) { struct thread *td = curthread; struct ktr_request *req; struct ktr_cap_fail *kcf; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; req = ktr_getrequest(KTR_CAPFAIL); if (req == NULL) return; kcf = &req->ktr_data.ktr_cap_fail; kcf->cap_type = type; if (needed != NULL) kcf->cap_needed = *needed; else cap_rights_init(&kcf->cap_needed); if (held != NULL) kcf->cap_held = *held; else cap_rights_init(&kcf->cap_held); ktr_enqueuerequest(td, req); ktrace_exit(td); } void ktrfault(vm_offset_t vaddr, int type) { struct thread *td = curthread; struct ktr_request *req; struct ktr_fault *kf; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; req = ktr_getrequest(KTR_FAULT); if (req == NULL) return; kf = &req->ktr_data.ktr_fault; kf->vaddr = vaddr; kf->type = type; ktr_enqueuerequest(td, req); ktrace_exit(td); } void ktrfaultend(int result) { struct thread *td = curthread; struct ktr_request *req; struct ktr_faultend *kf; if (__predict_false(curthread->td_pflags & TDP_INKTRACE)) return; req = ktr_getrequest(KTR_FAULTEND); if (req == NULL) return; kf = &req->ktr_data.ktr_faultend; kf->result = result; ktr_enqueuerequest(td, req); ktrace_exit(td); } #endif /* KTRACE */ /* Interface and common routines */ #ifndef _SYS_SYSPROTO_H_ struct ktrace_args { char *fname; int ops; int facs; int pid; }; #endif /* ARGSUSED */ int sys_ktrace(struct thread *td, struct ktrace_args *uap) { #ifdef KTRACE struct vnode *vp = NULL; struct proc *p; struct pgrp *pg; int facs = uap->facs & ~KTRFAC_ROOT; int ops = KTROP(uap->ops); int descend = uap->ops & KTRFLAG_DESCEND; int ret = 0; int flags, error = 0; struct nameidata nd; struct ktr_io_params *kiop, *old_kiop; /* * Need something to (un)trace. */ if (ops != KTROP_CLEARFILE && facs == 0) return (EINVAL); kiop = NULL; if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. */ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname); flags = FREAD | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE_PNBUF(&nd); vp = nd.ni_vp; VOP_UNLOCK(vp); if (vp->v_type != VREG) { (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (EACCES); } kiop = ktr_io_params_alloc(td, vp); } /* * Clear all uses of the tracefile. 
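 * From userspace this branch corresponds to a ktrace(2) call such as
 * (a hypothetical invocation, error handling omitted):
 *
 *	ktrace("ktrace.out", KTROP_CLEARFILE, 0, 0);
 *
 * which detaches every process still logging to that vnode, unlike
 * KTROP_CLEAR, which operates on a single pid or process group.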
*/ ktrace_enter(td); if (ops == KTROP_CLEARFILE) { restart: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { old_kiop = NULL; PROC_LOCK(p); if (p->p_ktrioparms != NULL && p->p_ktrioparms->vp == vp) { if (ktrcanset(td, p)) { mtx_lock(&ktrace_mtx); old_kiop = ktr_freeproc(p); mtx_unlock(&ktrace_mtx); } else error = EPERM; } PROC_UNLOCK(p); if (old_kiop != NULL) { sx_sunlock(&allproc_lock); ktr_io_params_free(old_kiop); goto restart; } } sx_sunlock(&allproc_lock); goto done; } /* * do it */ sx_slock(&proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } /* * ktrops() may call vrele(). Lock pg_members * by the proctree_lock rather than pg_mtx. */ PGRP_UNLOCK(pg); if (LIST_EMPTY(&pg->pg_members)) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (descend) ret |= ktrsetchildren(td, p, ops, facs, kiop); else ret |= ktrops(td, p, ops, facs, kiop); } } else { /* * by pid */ p = pfind(uap->pid); if (p == NULL) { error = ESRCH; sx_sunlock(&proctree_lock); goto done; } if (descend) ret |= ktrsetchildren(td, p, ops, facs, kiop); else ret |= ktrops(td, p, ops, facs, kiop); } sx_sunlock(&proctree_lock); if (!ret) error = EPERM; done: if (kiop != NULL) { mtx_lock(&ktrace_mtx); kiop = ktr_io_params_rele(kiop); mtx_unlock(&ktrace_mtx); ktr_io_params_free(kiop); } ktrace_exit(td); return (error); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } /* ARGSUSED */ int sys_utrace(struct thread *td, struct utrace_args *uap) { #ifdef KTRACE struct ktr_request *req; void *cp; int error; if (!KTRPOINT(td, KTR_USER)) return (0); if (uap->len > KTR_USER_MAXLEN) return (EINVAL); cp = malloc(uap->len, M_KTRACE, M_WAITOK); error = copyin(uap->addr, cp, uap->len); if (error) { free(cp, M_KTRACE); return (error); } req = ktr_getrequest(KTR_USER); if (req == NULL) { free(cp, M_KTRACE); return (ENOMEM); } req->ktr_buffer = cp; req->ktr_header.ktr_len = uap->len; ktr_submitrequest(td, req); return (0); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } #ifdef KTRACE static int ktrops(struct thread *td, struct proc *p, int ops, int facs, struct ktr_io_params *new_kiop) { struct ktr_io_params *old_kiop; PROC_LOCK_ASSERT(p, MA_OWNED); if (!ktrcanset(td, p)) { PROC_UNLOCK(p); return (0); } if ((ops == KTROP_SET && p->p_state == PRS_NEW) || p_cansee(td, p) != 0) { /* * Disallow setting trace points if the process is being born. * This avoids races with trace point inheritance in * ktrprocfork(). */ PROC_UNLOCK(p); return (0); } if ((p->p_flag & P_WEXIT) != 0) { /* * There's nothing to do if the process is exiting, but avoid * signaling an error. 
*/ PROC_UNLOCK(p); return (1); } old_kiop = NULL; mtx_lock(&ktrace_mtx); if (ops == KTROP_SET) { if (p->p_ktrioparms != NULL && p->p_ktrioparms->vp != new_kiop->vp) { /* if trace file already in use, relinquish below */ old_kiop = ktr_io_params_rele(p->p_ktrioparms); p->p_ktrioparms = NULL; } if (p->p_ktrioparms == NULL) { p->p_ktrioparms = new_kiop; ktr_io_params_ref(new_kiop); } p->p_traceflag |= facs; if (priv_check(td, PRIV_KTRACE) == 0) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) /* no more tracing */ old_kiop = ktr_freeproc(p); } mtx_unlock(&ktrace_mtx); if ((p->p_traceflag & KTRFAC_MASK) != 0) ktrprocctor_entered(td, p); PROC_UNLOCK(p); ktr_io_params_free(old_kiop); return (1); } static int ktrsetchildren(struct thread *td, struct proc *top, int ops, int facs, struct ktr_io_params *new_kiop) { struct proc *p; int ret = 0; p = top; PROC_LOCK_ASSERT(p, MA_OWNED); sx_assert(&proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, new_kiop); /* * If this process has children, descend to them next, * otherwise do any siblings, and if done with this level, * follow back up the tree (but not past top). */ if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) return (ret); if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } PROC_LOCK(p); } /*NOTREACHED*/ } static void ktr_writerequest(struct thread *td, struct ktr_request *req) { struct ktr_io_params *kiop, *kiop1; struct ktr_header *kth; struct vnode *vp; struct proc *p; struct ucred *cred; struct uio auio; struct iovec aiov[3]; struct mount *mp; off_t lim; int datalen, buflen; int error; p = td->td_proc; /* * We reference the kiop for use in I/O in case ktrace is * disabled on the process as we write out the request. */ mtx_lock(&ktrace_mtx); kiop = p->p_ktrioparms; /* * If kiop is NULL, it has been cleared out from under this * request, so just drop it. */ if (kiop == NULL) { mtx_unlock(&ktrace_mtx); return; } ktr_io_params_ref(kiop); vp = kiop->vp; cred = kiop->cr; lim = kiop->lim; KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL")); mtx_unlock(&ktrace_mtx); kth = &req->ktr_header; KASSERT(((u_short)kth->ktr_type & ~KTR_TYPE) < nitems(data_lengths), ("data_lengths array overflow")); datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_TYPE]; buflen = kth->ktr_len; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; aiov[0].iov_base = (caddr_t)kth; aiov[0].iov_len = sizeof(struct ktr_header); auio.uio_resid = sizeof(struct ktr_header); auio.uio_iovcnt = 1; auio.uio_td = td; if (datalen != 0) { aiov[1].iov_base = (caddr_t)&req->ktr_data; aiov[1].iov_len = datalen; auio.uio_resid += datalen; auio.uio_iovcnt++; kth->ktr_len += datalen; } if (buflen != 0) { KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write")); aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer; aiov[auio.uio_iovcnt].iov_len = buflen; auio.uio_resid += buflen; auio.uio_iovcnt++; } vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); td->td_ktr_io_lim = lim; #ifdef MAC error = mac_vnode_check_write(cred, NOCRED, vp); if (error == 0) #endif error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == 0) { mtx_lock(&ktrace_mtx); kiop = ktr_io_params_rele(kiop); mtx_unlock(&ktrace_mtx); ktr_io_params_free(kiop); return; } /* * If error encountered, give up tracing on this vnode on this * process. 
Other processes might still be suitable for * writes to this vnode. */ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped for pid %d\n", error, p->p_pid); kiop1 = NULL; PROC_LOCK(p); mtx_lock(&ktrace_mtx); if (p->p_ktrioparms != NULL && p->p_ktrioparms->vp == vp) kiop1 = ktr_freeproc(p); kiop = ktr_io_params_rele(kiop); mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); ktr_io_params_free(kiop1); ktr_io_params_free(kiop); } /* * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that * root previously set the tracing status on the target process, and * so, only root may further change it. */ static int ktrcanset(struct thread *td, struct proc *targetp) { PROC_LOCK_ASSERT(targetp, MA_OWNED); if (targetp->p_traceflag & KTRFAC_ROOT && priv_check(td, PRIV_KTRACE)) return (0); if (p_candebug(td, targetp) != 0) return (0); return (1); } #endif /* KTRACE */ diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index f6190c3fb05f..b926f98892a5 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1,2173 +1,2174 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include /* * The following macro defines how many bytes will be allocated from * the stack instead of memory allocated when passing the IOCTL data * structures from userspace and to the kernel. Some IOCTLs having * small data structures are used very frequently and this small * buffer on the stack gives a significant speedup improvement for * those requests. The value of this define should be greater or equal * to 64 bytes and should also be power of two. The data structure is * currently hard-aligned to a 8-byte boundary on the stack. This * should currently be sufficient for all supported platforms. */ #define SYS_IOCTL_SMALL_SIZE 128 /* bytes */ #define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */ #ifdef __LP64__ static int iosize_max_clamp = 0; SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); static int devfs_iosize_max_clamp = 1; SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW, &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices"); #endif /* * Assert that the return value of read(2) and write(2) syscalls fits * into a register. If not, an architecture will need to provide the * usermode wrappers to reconstruct the result. */ CTASSERT(sizeof(register_t) >= sizeof(size_t)); static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); static int seltdwait(struct thread *, sbintime_t, sbintime_t); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. 
*/ void *sf_cookie; /* (k) fd or pollfd. */ }; MALLOC_DEFINE(M_SELFD, "selfd", "selfd"); static struct mtx_pool *mtxpool_select; #ifdef __LP64__ size_t devfs_iosize_max(void) { return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? INT_MAX : SSIZE_MAX); } size_t iosize_max(void) { return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? INT_MAX : SSIZE_MAX); } #endif #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int sys_read(struct thread *td, struct read_args *uap) { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return (error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pread(struct thread *td, struct pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); freeuio(auio); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &cap_read_rights, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); freeuio(auio); return (error); } int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset) { struct file *fp; int error; error = fget_read(td, fd, &cap_pread_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. 
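 *
 * Note the partial-transfer rule applied here: if fo_read() is
 * interrupted after some bytes have been copied, the error is dropped
 * and the byte count is returned instead. E.g. a read(2) of 8192 bytes
 * that takes a signal after 4096 bytes were transferred returns 4096,
 * not EINTR.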
*/ static int dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio, off_t offset, int flags) { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return (0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int sys_write(struct thread *td, struct write_args *uap) { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return (error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pwrite(struct thread *td, struct pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); freeuio(auio); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &cap_write_rights, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); freeuio(auio); return (error); } int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) { struct file *fp; int error; error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio, off_t offset, int flags) { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; error = fo_write(fp, auio, td->td_ucred, flags, td); /* * Socket layer is responsible for special error handling, * see sousrsend(). */ if (error != 0 && fp->f_type != DTYPE_SOCKET) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { - ktruio->uio_resid = cnt; + if (error == 0) + ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Truncate a file given a file descriptor. * * Can't use fget_write() here, since must return EINVAL and not EBADF if the * descriptor isn't writable. */ int kern_ftruncate(struct thread *td, int fd, off_t length) { struct file *fp; int error; AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); error = fget(td, fd, &cap_ftruncate_rights, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); return (EINVAL); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int sys_ftruncate(struct thread *td, struct ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(struct thread *td, struct oftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int sys_ioctl(struct thread *td, struct ioctl_args *uap) { u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN); uint32_t com; int arg, error; u_int size; caddr_t data; #ifdef INVARIANTS if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); } #endif com = (uint32_t)uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. 
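 *
 * The direction and size are encoded into the command word by the
 * _IO*() macros; e.g. FIONREAD is defined as _IOR('f', 127, int), so it
 * carries IOC_OUT and IOCPARM_LEN(FIONREAD) == sizeof(int), and the
 * kernel copies the buffer back out to the caller after the handler
 * runs.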
*/ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (com & IOC_VOID) { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } else { if (size > SYS_IOCTL_SMALL_SIZE) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else data = smalldata; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error != 0) goto out; } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); out: if (size > SYS_IOCTL_SMALL_SIZE) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error, tmp, locked; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: case FIOCLEX: FILEDESC_XLOCK(fdp); locked = LA_XLOCKED; break; default: #ifdef CAPABILITIES FILEDESC_SLOCK(fdp); locked = LA_SLOCKED; #else locked = LA_UNLOCKED; #endif break; } #ifdef CAPABILITIES if ((fp = fget_noref(fdp, fd)) == NULL) { error = EBADF; goto out; } if ((error = cap_ioctl_check(fdp, fd, com)) != 0) { fp = NULL; /* fhold() was not called yet */ goto out; } if (!fhold(fp)) { error = EBADF; fp = NULL; goto out; } if (locked == LA_SLOCKED) { FILEDESC_SUNLOCK(fdp); locked = LA_UNLOCKED; } #else error = fget(td, fd, &cap_ioctl_rights, &fp); if (error != 0) { fp = NULL; goto out; } #endif if ((fp->f_flag & (FREAD | FWRITE)) == 0) { error = EBADF; goto out; } switch (com) { case FIONCLEX: fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; goto out; case FIOCLEX: fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; goto out; case FIONBIO: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: switch (locked) { case LA_XLOCKED: FILEDESC_XUNLOCK(fdp); break; #ifdef CAPABILITIES case LA_SLOCKED: FILEDESC_SUNLOCK(fdp); break; #endif default: FILEDESC_UNLOCK_ASSERT(fdp); break; } if (fp != NULL) fdrop(fp, td); return (error); } int sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) { int error; error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); return (kern_posix_error(td, error)); } int kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) { struct file *fp; int error; AUDIT_ARG_FD(fd); if (offset < 0 || len <= 0) return (EINVAL); /* Check for wrap. 
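 * E.g. offset = OFF_MAX - 10 with len = 20 would wrap past OFF_MAX;
 * testing offset > OFF_MAX - len rejects this without signed overflow.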
*/ if (offset > OFF_MAX - len) return (EFBIG); AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_pwrite_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } error = fo_fallocate(fp, offset, len, td); out: fdrop(fp, td); return (error); } int sys_fspacectl(struct thread *td, struct fspacectl_args *uap) { struct spacectl_range rqsr, rmsr; int error, cerror; error = copyin(uap->rqsr, &rqsr, sizeof(rqsr)); if (error != 0) return (error); error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags, &rmsr); if (uap->rmsr != NULL) { cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr)); if (error == 0) error = cerror; } return (error); } int kern_fspacectl(struct thread *td, int fd, int cmd, const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp) { struct file *fp; struct spacectl_range rmsr; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(cmd); AUDIT_ARG_FFLAGS(flags); if (rqsr == NULL) return (EINVAL); rmsr = *rqsr; if (rmsrp != NULL) *rmsrp = rmsr; if (cmd != SPACECTL_DEALLOC || rqsr->r_offset < 0 || rqsr->r_len <= 0 || rqsr->r_offset > OFF_MAX - rqsr->r_len || (flags & ~SPACECTL_F_SUPPORTED) != 0) return (EINVAL); error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags, td->td_ucred, td); /* fspacectl is not restarted after signals if the file is modified. */ if (rmsr.r_len != rqsr->r_len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (rmsrp != NULL) *rmsrp = rmsr; out: fdrop(fp, td); return (error); } int kern_specialfd(struct thread *td, int type, void *arg) { struct file *fp; struct specialfd_eventfd *ae; int error, fd, fflags; fflags = 0; error = falloc_noinstall(td, &fp); if (error != 0) return (error); switch (type) { case SPECIALFD_EVENTFD: ae = arg; if ((ae->flags & EFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; error = eventfd_create_file(td, fp, ae->initval, ae->flags); break; default: error = EINVAL; break; } if (error == 0) error = finstall(td, fp, &fd, fflags, NULL); fdrop(fp, td); if (error == 0) td->td_retval[0] = fd; return (error); } int sys___specialfd(struct thread *td, struct __specialfd_args *args) { struct specialfd_eventfd ae; int error; switch (args->type) { case SPECIALFD_EVENTFD: if (args->len != sizeof(struct specialfd_eventfd)) { error = EINVAL; break; } error = copyin(args->req, &ae, sizeof(ae)); if (error != 0) break; if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) != 0) { error = EINVAL; break; } error = kern_specialfd(td, args->type, &ae); break; default: error = EINVAL; break; } return (error); } int poll_no_poll(int events) { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. 
*/ if (events & ~POLLSTANDARD) return (POLLNVAL); return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } int sys_pselect(struct thread *td, struct pselect_args *uap) { struct timespec ts; struct timeval tv, *tvp; sigset_t set, *uset; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&tv, &ts); tvp = &tv; } else tvp = NULL; if (uap->sm != NULL) { error = copyin(uap->sm, &set, sizeof(set)); if (error != 0) return (error); uset = &set; } else uset = NULL; return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, uset, NFDBITS)); } int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits) { int error; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error != 0) return (error); td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ ast_sched(td, TDA_SIGSUSPEND); } error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int sys_select(struct thread *td, struct select_args *uap) { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, NFDBITS)); } /* * In the unlikely case when user specified n greater then the last * open file descriptor, check that no bits are set after the last * valid fd. We must return EBADF if any is set. * * There are applications that rely on the behaviour. * * nd is fd_nfiles. */ static int select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) { char *addr, *oaddr; int b, i, res; uint8_t bits; if (nd >= ndu || fd_in == NULL) return (0); oaddr = NULL; bits = 0; /* silence gcc */ for (i = nd; i < ndu; i++) { b = i / NBBY; #if BYTE_ORDER == LITTLE_ENDIAN addr = (char *)fd_in + b; #else addr = (char *)fd_in; if (abi_nfdbits == NFDBITS) { addr += rounddown(b, sizeof(fd_mask)) + sizeof(fd_mask) - 1 - b % sizeof(fd_mask); } else { addr += rounddown(b, sizeof(uint32_t)) + sizeof(uint32_t) - 1 - b % sizeof(uint32_t); } #endif if (addr != oaddr) { res = fubyte(addr); if (res == -1) return (EFAULT); oaddr = addr; bits = res; } if ((bits & (1 << (i % NBBY))) != 0) return (EBADF); } return (0); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. 
*/ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval rtv; sbintime_t asbt, precision, rsbt; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; int error, lf, ndu; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_nfiles; if (nd > lf) nd = lf; error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); if (error != 0) return (error); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; ncpubytes = roundup(nd, abi_nfdbits) / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) { \ ibits[x] = NULL; \ obits[x] = NULL; \ } else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ if (ncpbytes != ncpubytes) \ bzero((char *)ibits[x] + ncpubytes, \ ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) /* * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, * we are running under 32-bit emulation. This should be more * generic. */ #define swizzle_fdset(bits) \ if (abi_nfdbits != NFDBITS && bits != NULL) { \ int i; \ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ } #else #define swizzle_fdset(bits) #endif /* Make sure the bit order makes it through an ABI transition */ swizzle_fdset(ibits[0]); swizzle_fdset(ibits[1]); swizzle_fdset(ibits[2]); if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); precision = 0; if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) { error = EINVAL; goto done; } if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, asbt, precision); if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... 
*/ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; /* swizzle bit order back, if necessary */ swizzle_fdset(obits[0]); swizzle_fdset(obits[1]); swizzle_fdset(obits[2]); #undef swizzle_fdset #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Convert a select bit set to poll flags. * * The backend always returns POLLHUP/POLLERR if appropriate and we * return this as a set bit in any set. */ static const int select_flags[3] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND | POLLERR }; /* * Compute the fo_poll flags required for a fd given by the index and * bit position in the fd_mask array. */ static __inline int selflags(fd_mask **ibits, int idx, fd_mask bit) { int flags; int msk; flags = 0; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; flags |= select_flags[msk]; } return (flags); } /* * Set the appropriate output bits given a mask of fired events and the * input bits originally requested. */ static __inline int selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) { int msk; int n; n = 0; for (msk = 0; msk < 3; msk++) { if ((events & select_flags[msk]) == 0) continue; if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; /* * XXX Check for a duplicate set. This can occur because a * socket calls selrecord() twice for each poll() call * resulting in two selfds per real fd. selrescan() will * call selsetbits twice as a result. */ if ((obits[msk][idx] & bit) != 0) continue; obits[msk][idx] |= bit; n++; } return (n); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct filedesc *fdp; struct selinfo *si; struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct file *fp; fd_mask bit; int fd, ev, n, idx; int error; bool only_user; fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; only_user = FILEDESC_IS_ONLY_USER(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if (only_user) error = fget_only_user(fdp, fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) return (error); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. */ static int selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd) { struct filedesc *fdp; struct file *fp; fd_mask bit; int ev, flags, end, fd; int n, idx; int error; bool only_user; fdp = td->td_proc->p_fd; n = 0; only_user = FILEDESC_IS_ONLY_USER(fdp); for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { /* Compute the list of events we're interested in. 
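 * For example, a descriptor present only in the read set yields
 * select_flags[0], i.e. POLLRDNORM | POLLHUP | POLLERR, so that
 * hangup and error conditions are reported as readability.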
*/ flags = selflags(ibits, idx, bit); if (flags == 0) continue; if (only_user) error = fget_only_user(fdp, fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) return (error); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } td->td_retval[0] = n; return (0); } int sys_poll(struct thread *td, struct poll_args *uap) { struct timespec ts, *tsp; if (uap->timeout != INFTIM) { if (uap->timeout < 0) return (EINVAL); ts.tv_sec = uap->timeout / 1000; ts.tv_nsec = (uap->timeout % 1000) * 1000000; tsp = &ts; } else tsp = NULL; return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL)); } /* * kfds points to an array in the kernel. */ int kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds, struct timespec *tsp, sigset_t *uset) { sbintime_t sbt, precision, tmp; time_t over; struct timespec ts; int error; precision = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) return (EINVAL); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) sbt = 0; else { ts = *tsp; if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); precision = tmp; precision >>= tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; } } else sbt = -1; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error) return (error); td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ ast_sched(td, TDA_SIGSUSPEND); } seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = pollscan(td, kfds, nfds); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, sbt, precision); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; return (error); } int sys_ppoll(struct thread *td, struct ppoll_args *uap) { struct timespec ts, *tsp; sigset_t set, *ssp; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; if (uap->set != NULL) { error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); ssp = &set; } else ssp = NULL; return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp)); } /* * ufds points to an array in user space. */ int kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds, struct timespec *tsp, sigset_t *set) { struct pollfd *kfds; struct pollfd stackfds[32]; int error; if (kern_poll_maxfds(nfds)) return (EINVAL); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else kfds = stackfds; error = copyin(ufds, kfds, nfds * sizeof(*kfds)); if (error != 0) goto out; error = kern_poll_kfds(td, kfds, nfds, tsp, set); if (error == 0) error = pollout(td, kfds, ufds, nfds); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray("pollfd", UIO_USERSPACE, ufds, nfds, sizeof(*ufds)); #endif out: if (nfds > nitems(stackfds)) free(kfds, M_TEMP); return (error); } bool kern_poll_maxfds(u_int nfds) { /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. 
Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the system-wide limits. We want to be reasonably * safe, but not overly restrictive. */ return (nfds > maxfilesperproc && nfds > FD_SETSIZE); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; int n, error; bool only_user; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; only_user = FILEDESC_IS_ONLY_USER(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if (only_user) error = fget_only_user(fdp, fd->fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd->fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) { fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (fd->revents != 0) n++; } stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error = 0; u_int i = 0; u_int n = 0; for (i = 0; i < nfd; i++) { error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); if (fds->revents != 0) n++; fds++; ufds++; } td->td_retval[0] = n; return (0); } static int pollscan(struct thread *td, struct pollfd *fds, u_int nfd) { struct filedesc *fdp; struct file *fp; int i, n, error; bool only_user; n = 0; fdp = td->td_proc->p_fd; only_user = FILEDESC_IS_ONLY_USER(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd < 0) { fds->revents = 0; continue; } if (only_user) error = fget_only_user(fdp, fds->fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fds->fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) { fds->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); /* * POSIX requires POLLOUT to be never * set simultaneously with POLLHUP. */ if ((fds->revents & POLLHUP) != 0) fds->revents &= ~POLLOUT; if (fds->revents != 0) n++; } td->td_retval[0] = n; return (0); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. It returns * 0 if any events matched and an error otherwise. There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval rtv; sbintime_t asbt, precision, rsbt; int error; precision = 0; /* stupid gcc! */ if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) return (EINVAL); if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. 
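 * There is no rescan step here: every iteration re-registers via
 * selfdalloc()/sopoll() before sleeping.  A hypothetical caller
 * waiting up to five seconds for the socket to become readable:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	error = selsocket(so, POLLIN, &tv, curthread);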
*/ for (;;) { selfdalloc(td, NULL); if (sopoll(so, events, NULL, td) != 0) { error = 0; break; } error = seltdwait(td, asbt, precision); if (error) break; } seltdclear(td); /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); }
/* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */
static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; }
static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); /* * Paired with doselwakeup. */ if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) { mtx_lock(sfp->sf_mtx); if (sfp->sf_si != NULL) { TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); } mtx_unlock(sfp->sf_mtx); } free(sfp, M_SELFD); }
/* Drain the waiters tied to all the selfd belonging to the specified selinfo. */
void seldrain(struct selinfo *sip) { /* * This feature is already provided by doselwakeup(), so it is * enough to call it. * Eventually, the calling context should take care to avoid races * between a thread calling select()/poll() and file descriptor * detaching, but, again, the races are just the same as for * selwakeup(). */ doselwakeup(sip, -1); }
/* * Record a select request. */
void selrecord(struct thread *selector, struct selinfo *sip) { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. */ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = sip->si_mtx; if (mtxp == NULL) mtxp = mtx_pool_find(mtxpool_select, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); }
/* Wake up a selecting thread. */
void selwakeup(struct selinfo *sip) { doselwakeup(sip, -1); }
/* Wake up a selecting thread, and set its priority. */
void selwakeuppri(struct selinfo *sip, int pri) { doselwakeup(sip, pri); }
/* * Do a wakeup when a selectable event occurs. */
static void doselwakeup(struct selinfo *sip, int pri) { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized, there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. */ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si, seltdclear will know to ignore this si.
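 * The atomic_store_rel_ptr() of sf_si below pairs with the
 * atomic_load_acq_ptr() in selfdfree(): once the waiter observes the
 * NULL it may free the selfd at any time, which is why sfp is not
 * touched after the store.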
*/ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); /* * Paired with selfdfree. * * Storing this only after the wakeup provides an invariant that * stp is not used after selfdfree returns. */ atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si, (uintptr_t)NULL); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp != NULL) { MPASS(stp->st_flags == 0); MPASS(STAILQ_EMPTY(&stp->st_selq)); return; } stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); td->td_sel = stp; } static int seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } if (sbt == 0) error = EWOULDBLOCK; else if (sbt != -1) error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx, sbt, precision, C_ABSOLUTE); else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; MPASS(stp->st_flags == 0); MPASS(STAILQ_EMPTY(&stp->st_selq)); if (stp->st_free1) free(stp->st_free1, M_SELFD); if (stp->st_free2) free(stp->st_free2, M_SELFD); td->td_sel = NULL; cv_destroy(&stp->st_wait); mtx_destroy(&stp->st_mtx); free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. */ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); } /* * Set up a syscall return value that follows the convention specified for * posix_* functions. 
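 * Such interfaces (e.g. posix_fallocate(2)) return the error number
 * directly instead of -1 with errno.  A positive error is stored in
 * td_retval[0] with TDP_NERRNO set, so the syscall return path hands
 * it back as an ordinary return value instead of a failure.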
*/ int kern_posix_error(struct thread *td, int error) { if (error <= 0) return (error); td->td_errno = error; td->td_pflags |= TDP_NERRNO; td->td_retval[0] = error; return (0); } int kcmp_cmp(uintptr_t a, uintptr_t b) { if (a == b) return (0); else if (a < b) return (1); return (2); } static int kcmp_pget(struct thread *td, pid_t pid, struct proc **pp) { if (pid == td->td_proc->p_pid) { *pp = td->td_proc; return (0); } return (pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD, pp)); } int kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type, uintptr_t idx1, uintptr_t idx2) { struct proc *p1, *p2; struct file *fp1, *fp2; int error, res; res = -1; p1 = p2 = NULL; error = kcmp_pget(td, pid1, &p1); if (error == 0) error = kcmp_pget(td, pid2, &p2); if (error != 0) goto out; switch (type) { case KCMP_FILE: case KCMP_FILEOBJ: error = fget_remote(td, p1, idx1, &fp1); if (error == 0) { error = fget_remote(td, p2, idx2, &fp2); if (error == 0) { if (type == KCMP_FILEOBJ) res = fo_cmp(fp1, fp2, td); else res = kcmp_cmp((uintptr_t)fp1, (uintptr_t)fp2); fdrop(fp2, td); } fdrop(fp1, td); } break; case KCMP_FILES: res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd); break; case KCMP_SIGHAND: res = kcmp_cmp((uintptr_t)p1->p_sigacts, (uintptr_t)p2->p_sigacts); break; case KCMP_VM: res = kcmp_cmp((uintptr_t)p1->p_vmspace, (uintptr_t)p2->p_vmspace); break; default: error = EINVAL; break; } out: if (p1 != NULL && p1 != td->td_proc) PRELE(p1); if (p2 != NULL && p2 != td->td_proc) PRELE(p2); td->td_retval[0] = res; return (error); } int sys_kcmp(struct thread *td, struct kcmp_args *uap) { return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type, uap->idx1, uap->idx2)); } int file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td) { if (fp1->f_type != fp2->f_type) return (3); return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index dce0ab9b53d4..b2d03d932b99 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1,1555 +1,1556 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */
#include #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_43 #include #endif #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int sockargs(struct mbuf **, char *, socklen_t, int);
/* * Convert a user file descriptor to a kernel file entry and check that the * required capability rights are present. * If requested, a copy of the current set of capability rights is returned. * A reference on the file entry is held upon returning. */
int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_type != DTYPE_SOCKET)) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } *fpp = fp; return (0); }
int getsock(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td, fd, rightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_type != DTYPE_SOCKET)) { fdrop(fp, td); return (ENOTSOCK); } *fpp = fp; return (0); }
/* * System call interface to the socket abstraction. */
#if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif
int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); }
int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc().
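 * That reference is what lets the error path fdclose() the half-built
 * descriptor and the success path finit() it, with the single fdrop()
 * below releasing our hold in either case.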
*/ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock(td, fd, &cap_bind_rights, &fp); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock(td, s, &cap_listen_rights, &fp); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags) { struct sockaddr_storage ss = { .ss_len = sizeof(ss) }; socklen_t addrlen; struct file *fp; int error; if (uname != NULL) { error = copyin(anamelen, &addrlen, sizeof(addrlen)); if (error != 0) return (error); } error = kern_accept4(td, s, (struct sockaddr *)&ss, flags, &fp); if (error != 0) return (error); #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)&ss)->sa_family = ss.ss_family; #endif if (uname != NULL) { addrlen = min(ss.ss_len, addrlen); error = copyout(&ss, uname, addrlen); if (error == 0) { addrlen = ss.ss_len; error = copyout(&addrlen, anamelen, sizeof(addrlen)); } } if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr *sa, struct file **fp) { return (kern_accept4(td, s, sa, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr *sa, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct socket *head, *so; struct filecaps fcaps; u_int fflag; pid_t pgid; int error, fd, tmp; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_accept_rights, &headfp, &fcaps); if (error != 0) return (error); fflag = atomic_load_int(&headfp->f_flag); head = headfp->f_data; if (!SOLISTENING(head)) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, 
(flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; SOCK_LOCK(head); if (!SOLISTENING(head)) { SOCK_UNLOCK(head); error = EINVAL; goto noconnection; } error = solisten_dequeue(head, &so, flags); if (error != 0) goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* Connection has been removed from the listen queue. */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); if ((error = soaccept(so, sa)) == 0) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif } noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(struct thread *td, struct accept_args *uap) { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(struct thread *td, struct accept4_args *uap) { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(struct thread *td, struct oaccept_args *uap) { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock(td, fd, &cap_connect_rights, &fp); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) break; } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = 
getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) { struct unpcb *unp, *unp2; unp = sotounpcb(so1); unp2 = sotounpcb(so2); /* * No need to lock the unps, because the sockets are brand-new. * No other threads can be using them yet */ unp_copy_peercred(td, unp, unp2, unp); } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && (mp->msg_flags != MSG_COMPAT || !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = 
SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t *rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); rights = &cap_send_rights; if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); rights = &cap_send_connect_rights; } error = getsock(td, s, rights, &fp); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sousrsend(so, mp->msg_name, &auio, control, flags, NULL); if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { - ktruio->uio_resid = td->td_retval[0]; + if (error == 0) + ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = __DECONST(void *, uap->to); msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, 
uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *control, *m; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock(td, s, &cap_recv_rights, &fp); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif control = NULL; len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { /* MSG_TRUNC can trigger underflow of uio_resid. */ ktruio->uio_resid = MIN(len - auio.uio_resid, len); ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
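 * "Trim" here means advancing the mbuf's m_data past the cmsghdr and
 * shrinking m_len to match, so the old interface sees only the raw
 * descriptor array through msg_accrights.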
*/ if (control && (mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif ctlbuf = mp->msg_control; len = mp->msg_controllen; mp->msg_controllen = 0; for (m = control; m != NULL && len >= m->m_len; m = m->m_next) { if ((error = copyout(mtod(m, caddr_t), ctlbuf, m->m_len)) != 0) goto out; ctlbuf += m->m_len; len -= m->m_len; mp->msg_controllen += m->m_len; } if (m != NULL) { mp->msg_flags |= MSG_CTRUNC; m_dispose_extcontrolm(m); } } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control != NULL) { if (error != 0) m_dispose_extcontrolm(control); m_freem(control); } return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } return (error); } static int kern_recvfrom(struct thread *td, int s, void *buf, size_t len, int flags, struct sockaddr *from, socklen_t *fromlenaddr) { struct msghdr msg; struct iovec aiov; int error; if (fromlenaddr != NULL) { error = copyin(fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = buf; aiov.iov_len = len; msg.msg_control = 0; msg.msg_flags = flags; error = recvit(td, s, &msg, fromlenaddr); done2: return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { return (kern_recvfrom(td, uap->s, uap->buf, uap->len, uap->flags, uap->from, uap->fromlenaddr)); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct orecvfrom_args *uap) { return (kern_recvfrom(td, uap->s, uap->buf, uap->len, uap->flags | MSG_COMPAT, uap->from, uap->fromlenaddr)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. 
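 * Roughly, from the 4.3BSD definitions (for orientation only): struct
 * omsghdr carries msg_name/msg_namelen and msg_iov/msg_iovlen as the
 * new struct does, then msg_accrights/msg_accrightslen in the slots
 * now occupied by msg_control/msg_controllen, and has no msg_flags
 * member at the end.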
*/ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif
int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); }
int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); }
int kern_shutdown(struct thread *td, int s, int how) { struct socket *so; struct file *fp; int error; if (__predict_false(how < SHUT_RD || how > SHUT_RDWR)) return (EINVAL); AUDIT_ARG_FD(s); error = getsock(td, s, &cap_shutdown_rights, &fp); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); }
int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); }
int kern_setsockopt(struct thread *td, int s, int level, int name, const void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = __DECONST(void *, val); sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock(td, s, &cap_setsockopt_rights, &fp); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return (error); }
int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); }
/* * Kernel version of getsockopt. * optval can be a userland or kernel pointer; optlen is always a kernel * pointer.
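 * A minimal in-kernel usage sketch (s is assumed to be a valid socket
 * descriptor):
 *
 *	int val;
 *	socklen_t len = sizeof(val);
 *
 *	error = kern_getsockopt(td, s, SOL_SOCKET, SO_ERROR, &val,
 *	    UIO_SYSSPACE, &len);
 *
 * With UIO_SYSSPACE, sopt_td is left NULL and the option value is
 * copied with bcopy() rather than copyout().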
*/ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock(td, s, &cap_getsockopt_rights, &fp); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } static int user_getsockname(struct thread *td, int fdes, struct sockaddr *asa, socklen_t *alen, bool compat) { struct sockaddr_storage ss = { .ss_len = sizeof(ss) }; socklen_t len; int error; error = copyin(alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, fdes, (struct sockaddr *)&ss); if (error != 0) return (error); #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)&ss)->sa_family = ss.ss_family; #endif len = min(ss.ss_len, len); error = copyout(&ss, asa, len); if (error == 0) { len = ss.ss_len; error = copyout(&len, alen, sizeof(len)); } return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getsock(td, fd, &cap_getsockname_rights, &fp); if (error != 0) return (error); so = fp->f_data; error = sosockaddr(so, sa); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif fdrop(fp, td); return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (user_getsockname(td, uap->fdes, uap->asa, uap->alen, false)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct ogetsockname_args *uap) { return (user_getsockname(td, uap->fdes, uap->asa, uap->alen, true)); } #endif /* COMPAT_OLDSOCK */ static int user_getpeername(struct thread *td, int fdes, struct sockaddr *asa, socklen_t *alen, bool compat) { struct sockaddr_storage ss = { .ss_len = sizeof(ss) }; socklen_t len; int error; error = copyin(alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, fdes, (struct sockaddr *)&ss); if (error != 0) return (error); #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)&ss)->sa_family = ss.ss_family; #endif len = min(ss.ss_len, len); error = copyout(&ss, asa, len); if (error == 0) { len = ss.ss_len; error = copyout(&len, alen, sizeof(len)); } return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getsock(td, fd, &cap_getpeername_rights, &fp); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; } error = sopeeraddr(so, sa); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (user_getpeername(td, uap->fdes, uap->asa, uap->alen, false)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { return (user_getpeername(td, uap->fdes, uap->asa, 
uap->alen, true)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112 && SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EMSGSIZE); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Dispose of externalized rights from an SCM_RIGHTS message. This function * should be used in error or truncation cases to avoid leaking file descriptors * into the recipient's (the current thread's) table. */ void m_dispose_extcontrolm(struct mbuf *m) { struct cmsghdr *cm; struct file *fp; struct thread *td; socklen_t clen, datalen; int error, fd, *fds, nfd; td = curthread; for (; m != NULL; m = m->m_next) { if (m->m_type != MT_EXTCONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (clen > 0) { if (clen < sizeof(*cm)) panic("%s: truncated mbuf %p", __func__, m); datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0)); if (clen < datalen) panic("%s: truncated mbuf %p", __func__, m); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { fds = (int *)CMSG_DATA(cm); nfd = (cm->cmsg_len - CMSG_SPACE(0)) / sizeof(int); while (nfd-- > 0) { fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); if (error == 0) { fdclose(td, fp, fd); fdrop(fp, td); } } } clen -= datalen; cm = (struct cmsghdr *)((uint8_t *)cm + datalen); } m_chtype(m, MT_CONTROL); } } diff --git a/sys/netinet/sctp_syscalls.c b/sys/netinet/sctp_syscalls.c index a5e4fa8f9603..d67e260b6f99 100644 --- a/sys/netinet/sctp_syscalls.c +++ b/sys/netinet/sctp_syscalls.c @@ -1,553 +1,555 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */
#include #include "opt_capsicum.h" #include "opt_sctp.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #include #include #endif #include #include #include #include #include #include
static struct syscall_helper_data sctp_syscalls[] = { SYSCALL_INIT_HELPER_F(sctp_peeloff, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(sctp_generic_sendmsg, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(sctp_generic_sendmsg_iov, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(sctp_generic_recvmsg, SYF_CAPENABLED), SYSCALL_INIT_LAST };
#ifdef COMPAT_FREEBSD32 static struct syscall_helper_data sctp32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(sctp_peeloff), SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_sendmsg), SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_sendmsg_iov), SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_recvmsg), SYSCALL_INIT_LAST }; #endif
int sctp_syscalls_init(void) { int error; error = syscall_helper_register(sctp_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sctp32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); }
#ifdef SCTP SYSINIT(sctp_syscalls, SI_SUB_SYSCALLS, SI_ORDER_ANY, sctp_syscalls_init, NULL); #endif
int sctp_syscalls_uninit(void) { int error; #ifdef COMPAT_FREEBSD32 error = syscall32_helper_unregister(sctp32_syscalls); if (error != 0) return (error); #endif error = syscall_helper_unregister(sctp_syscalls); if (error != 0) return (error); return (0); }
/* * SCTP syscalls. */
int sys_sctp_peeloff(struct thread *td, struct sctp_peeloff_args *uap) { struct file *headfp, *nfp = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); error = getsock(td, uap->sd, cap_rights_init_one(&rights, CAP_PEELOFF), &headfp); if (error != 0) goto done2; fflag = atomic_load_int(&headfp->f_flag); head = headfp->f_data; if (head->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto done; } error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error != 0) goto done; /* * At this point we know we do have an assoc to pull; * we proceed to get the fd set up. This may block, * but that is OK.
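 * falloc() may sleep allocating the file, and the new descriptor is
 * visible to the process before the peeloff completes; the error
 * paths below therefore fdclose() it again if anything fails.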
*/ error = falloc(td, &nfp, &fd, 0); if (error != 0) goto done; td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); so = sopeeloff(head); if (so == NULL) { error = ENOMEM; goto noconnection; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. */ CURVNET_RESTORE(); done: if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); done2: return (error); } int sys_sctp_generic_sendmsg(struct thread *td, struct sctp_generic_sendmsg_args *uap) { struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; cap_rights_t rights; int error = 0, len; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init_one(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set_one(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); error = getsock(td, uap->sd, &rights, &fp); if (error != 0) goto sctp_bad; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid = uap->mlen; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. 
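 * Either the SO_NOSIGPIPE socket option or the per-call MSG_NOSIGNAL
 * flag suppresses the signal, e.g. from userland:
 *
 *	int on = 1;
 *
 *	(void)setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on));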
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
-		ktruio->uio_resid = td->td_retval[0];
+		if (error == 0)
+			ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
}

int
sys_sctp_generic_sendmsg_iov(struct thread *td,
    struct sctp_generic_sendmsg_iov_args *uap)
{
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec *iov, *tiov;
	cap_rights_t rights;
	ssize_t len;
	int error, i;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}
	cap_rights_init_one(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		cap_rights_set_one(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock(td, uap->sd, &rights, &fp);
	if (error != 0)
		goto sctp_bad1;

#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto sctp_bad1;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;	/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto sctp_bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	len = auio.uio_resid;
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
-		ktruio->uio_resid = td->td_retval[0];
+		if (error == 0)
+			ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	free(iov, M_IOV);
sctp_bad1:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
}

int
sys_sctp_generic_recvmsg(struct thread *td,
    struct sctp_generic_recvmsg_args *uap)
{
	uint8_t sockbufstore[256];
	struct uio auio;
	struct iovec *iov, *tiov;
	struct sctp_sndrcvinfo sinfo;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *fromsa;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, fromlen, i, msg_flags;

	AUDIT_ARG_FD(uap->sd);
	error = getsock(td, uap->sd, cap_rights_init_one(&rights, CAP_RECV),
	    &fp);
	if (error != 0)
		return (error);
#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto out1;

	so = fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif /* MAC */

	if (uap->fromlenaddr != NULL) {
		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
		if (error != 0)
			goto out;
	} else {
		fromlen = 0;
	}
	if (uap->msg_flags) {
		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	} else {
		msg_flags = 0;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;	/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto out;
		}
	}
	len = auio.uio_resid;
	fromsa = (struct sockaddr *)sockbufstore;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
	CURVNET_SET(so->so_vnet);
	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
	    fromsa, fromlen, &msg_flags,
	    (struct sctp_sndrcvinfo *)&sinfo, 1);
	CURVNET_RESTORE();
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	} else {
		if (uap->sinfo)
			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(uap->sd, UIO_READ, ktruio, error);
	}
#endif /* KTRACE */
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;

	if (fromlen && uap->from) {
		len = fromlen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			len = MIN(len, fromsa->sa_len);
			error = copyout(fromsa, uap->from, (size_t)len);
			if (error != 0)
				goto out;
		}
		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
		if (error != 0)
			goto out;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (uap->msg_flags) {
		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	}
out:
	free(iov, M_IOV);
out1:
	if (fp != NULL)
		fdrop(fp, td);

	return (error);
}