Index: stable/11/sys/compat/cloudabi/cloudabi_mem.c
===================================================================
--- stable/11/sys/compat/cloudabi/cloudabi_mem.c	(revision 314333)
+++ stable/11/sys/compat/cloudabi/cloudabi_mem.c	(revision 314334)
@@ -1,198 +1,178 @@
 /*-
  * Copyright (c) 2015 Nuxi, https://nuxi.nl/
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/mman.h>
-#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
 
 #include <contrib/cloudabi/cloudabi_types_common.h>
 
 #include <compat/cloudabi/cloudabi_proto.h>
 
 /* Converts CloudABI's memory protection flags to FreeBSD's. */
 static int
 convert_mprot(cloudabi_mprot_t in, int *out)
 {
 
 	/* Unknown protection flags. */
 	if ((in & ~(CLOUDABI_PROT_EXEC | CLOUDABI_PROT_WRITE |
 	    CLOUDABI_PROT_READ)) != 0)
 		return (ENOTSUP);
 	/* W^X: Write and exec cannot be enabled at the same time. */
 	if ((in & (CLOUDABI_PROT_EXEC | CLOUDABI_PROT_WRITE)) ==
 	    (CLOUDABI_PROT_EXEC | CLOUDABI_PROT_WRITE))
 		return (ENOTSUP);
 
 	*out = 0;
 	if (in & CLOUDABI_PROT_EXEC)
 		*out |= PROT_EXEC;
 	if (in & CLOUDABI_PROT_WRITE)
 		*out |= PROT_WRITE;
 	if (in & CLOUDABI_PROT_READ)
 		*out |= PROT_READ;
 	return (0);
 }
 
 int
 cloudabi_sys_mem_advise(struct thread *td,
     struct cloudabi_sys_mem_advise_args *uap)
 {
-	struct madvise_args madvise_args = {
-		.addr	= uap->addr,
-		.len	= uap->len
-	};
+	int behav;
 
 	switch (uap->advice) {
 	case CLOUDABI_ADVICE_DONTNEED:
-		madvise_args.behav = MADV_DONTNEED;
+		behav = MADV_DONTNEED;
 		break;
 	case CLOUDABI_ADVICE_NORMAL:
-		madvise_args.behav = MADV_NORMAL;
+		behav = MADV_NORMAL;
 		break;
 	case CLOUDABI_ADVICE_RANDOM:
-		madvise_args.behav = MADV_RANDOM;
+		behav = MADV_RANDOM;
 		break;
 	case CLOUDABI_ADVICE_SEQUENTIAL:
-		madvise_args.behav = MADV_SEQUENTIAL;
+		behav = MADV_SEQUENTIAL;
 		break;
 	case CLOUDABI_ADVICE_WILLNEED:
-		madvise_args.behav = MADV_WILLNEED;
+		behav = MADV_WILLNEED;
 		break;
 	default:
 		return (EINVAL);
 	}
 
-	return (sys_madvise(td, &madvise_args));
+	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len,
+	    behav));
 }
 
 int
 cloudabi_sys_mem_lock(struct thread *td, struct cloudabi_sys_mem_lock_args *uap)
 {
-	struct mlock_args mlock_args = {
-		.addr	= uap->addr,
-		.len	= uap->len
-	};
 
-	return (sys_mlock(td, &mlock_args));
+	return (kern_mlock(td->td_proc, td->td_ucred,
+	    __DECONST(uintptr_t, uap->addr), uap->len));
 }
 
 int
 cloudabi_sys_mem_map(struct thread *td, struct cloudabi_sys_mem_map_args *uap)
 {
-	struct mmap_args mmap_args = {
-		.addr	= uap->addr,
-		.len	= uap->len,
-		.fd	= uap->fd,
-		.pos	= uap->off
-	};
-	int error;
+	int error, flags, prot;
 
 	/* Translate flags. */
+	flags = 0;
 	if (uap->flags & CLOUDABI_MAP_ANON)
-		mmap_args.flags |= MAP_ANON;
+		flags |= MAP_ANON;
 	if (uap->flags & CLOUDABI_MAP_FIXED)
-		mmap_args.flags |= MAP_FIXED;
+		flags |= MAP_FIXED;
 	if (uap->flags & CLOUDABI_MAP_PRIVATE)
-		mmap_args.flags |= MAP_PRIVATE;
+		flags |= MAP_PRIVATE;
 	if (uap->flags & CLOUDABI_MAP_SHARED)
-		mmap_args.flags |= MAP_SHARED;
+		flags |= MAP_SHARED;
 
 	/* Translate protection. */
-	error = convert_mprot(uap->prot, &mmap_args.prot);
+	error = convert_mprot(uap->prot, &prot);
 	if (error != 0)
 		return (error);
 
-	return (sys_mmap(td, &mmap_args));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
+	    uap->fd, uap->off));
 }
 
 int
 cloudabi_sys_mem_protect(struct thread *td,
     struct cloudabi_sys_mem_protect_args *uap)
 {
-	struct mprotect_args mprotect_args = {
-		.addr	= uap->addr,
-		.len	= uap->len,
-	};
-	int error;
+	int error, prot;
 
 	/* Translate protection. */
-	error = convert_mprot(uap->prot, &mprotect_args.prot);
+	error = convert_mprot(uap->prot, &prot);
 	if (error != 0)
 		return (error);
 
-	return (sys_mprotect(td, &mprotect_args));
+	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
+	    prot));
 }
 
 int
 cloudabi_sys_mem_sync(struct thread *td, struct cloudabi_sys_mem_sync_args *uap)
 {
-	struct msync_args msync_args = {
-		.addr	= uap->addr,
-		.len	= uap->len,
-	};
+	int flags;
 
 	/* Convert flags. */
 	switch (uap->flags & (CLOUDABI_MS_ASYNC | CLOUDABI_MS_SYNC)) {
 	case CLOUDABI_MS_ASYNC:
-		msync_args.flags |= MS_ASYNC;
+		flags = MS_ASYNC;
 		break;
 	case CLOUDABI_MS_SYNC:
-		msync_args.flags |= MS_SYNC;
+		flags = MS_SYNC;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if ((uap->flags & CLOUDABI_MS_INVALIDATE) != 0)
-		msync_args.flags |= MS_INVALIDATE;
+		flags |= MS_INVALIDATE;
 
-	return (sys_msync(td, &msync_args));
+	return (kern_msync(td, (uintptr_t)uap->addr, uap->len,
+	    flags));
 }
 
 int
 cloudabi_sys_mem_unlock(struct thread *td,
     struct cloudabi_sys_mem_unlock_args *uap)
 {
-	struct munlock_args munlock_args = {
-		.addr	= uap->addr,
-		.len	= uap->len
-	};
 
-	return (sys_munlock(td, &munlock_args));
+	return (kern_munlock(td, __DECONST(uintptr_t, uap->addr),
+	    uap->len));
 }
 
 int
 cloudabi_sys_mem_unmap(struct thread *td,
     struct cloudabi_sys_mem_unmap_args *uap)
 {
-	struct munmap_args munmap_args = {
-		.addr	= uap->addr,
-		.len	= uap->len
-	};
 
-	return (sys_munmap(td, &munmap_args));
+	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
 }
Index: stable/11/sys/compat/freebsd32/freebsd32_misc.c
===================================================================
--- stable/11/sys/compat/freebsd32/freebsd32_misc.c	(revision 314333)
+++ stable/11/sys/compat/freebsd32/freebsd32_misc.c	(revision 314334)
@@ -1,3173 +1,3160 @@
 /*-
  * Copyright (c) 2002 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #define __ELF_WORD_SIZE 32
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/capsicum.h>
 #include <sys/clock.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/imgact.h>
 #include <sys/mbuf.h>
 #include <sys/mman.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/reboot.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/eventvar.h>	/* Must come after sys/selinfo.h */
 #include <sys/pipe.h>		/* Must come after sys/selinfo.h */
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/thr.h>
 #include <sys/unistd.h>
 #include <sys/ucontext.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/ipc.h>
 #include <sys/msg.h>
 #include <sys/sem.h>
 #include <sys/shm.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/elf.h>
 
 #include <security/audit/audit.h>
 
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_misc.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 
 FEATURE(compat_freebsd_32bit, "Compatible with 32-bit FreeBSD");
 
 #ifndef __mips__
 CTASSERT(sizeof(struct timeval32) == 8);
 CTASSERT(sizeof(struct timespec32) == 8);
 CTASSERT(sizeof(struct itimerval32) == 16);
 #endif
 CTASSERT(sizeof(struct statfs32) == 256);
 #ifndef __mips__
 CTASSERT(sizeof(struct rusage32) == 72);
 #endif
 CTASSERT(sizeof(struct sigaltstack32) == 12);
 CTASSERT(sizeof(struct kevent32) == 20);
 CTASSERT(sizeof(struct iovec32) == 8);
 CTASSERT(sizeof(struct msghdr32) == 28);
 #ifndef __mips__
 CTASSERT(sizeof(struct stat32) == 96);
 #endif
 CTASSERT(sizeof(struct sigaction32) == 24);
 
 static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count);
 
 void
 freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32)
 {
 
 	TV_CP(*s, *s32, ru_utime);
 	TV_CP(*s, *s32, ru_stime);
 	CP(*s, *s32, ru_maxrss);
 	CP(*s, *s32, ru_ixrss);
 	CP(*s, *s32, ru_idrss);
 	CP(*s, *s32, ru_isrss);
 	CP(*s, *s32, ru_minflt);
 	CP(*s, *s32, ru_majflt);
 	CP(*s, *s32, ru_nswap);
 	CP(*s, *s32, ru_inblock);
 	CP(*s, *s32, ru_oublock);
 	CP(*s, *s32, ru_msgsnd);
 	CP(*s, *s32, ru_msgrcv);
 	CP(*s, *s32, ru_nsignals);
 	CP(*s, *s32, ru_nvcsw);
 	CP(*s, *s32, ru_nivcsw);
 }
 
 int
 freebsd32_wait4(struct thread *td, struct freebsd32_wait4_args *uap)
 {
 	int error, status;
 	struct rusage32 ru32;
 	struct rusage ru, *rup;
 
 	if (uap->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (error)
 		return (error);
 	if (uap->status != NULL)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->rusage != NULL && error == 0) {
 		freebsd32_rusage_out(&ru, &ru32);
 		error = copyout(&ru32, uap->rusage, sizeof(ru32));
 	}
 	return (error);
 }
 
 int
 freebsd32_wait6(struct thread *td, struct freebsd32_wait6_args *uap)
 {
 	struct wrusage32 wru32;
 	struct __wrusage wru, *wrup;
 	struct siginfo32 si32;
 	struct __siginfo si, *sip;
 	int error, status;
 
 	if (uap->wrusage != NULL)
 		wrup = &wru;
 	else
 		wrup = NULL;
 	if (uap->info != NULL) {
 		sip = &si;
 		bzero(sip, sizeof(*sip));
 	} else
 		sip = NULL;
 	error = kern_wait6(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    &status, uap->options, wrup, sip);
 	if (error != 0)
 		return (error);
 	if (uap->status != NULL)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->wrusage != NULL && error == 0) {
 		freebsd32_rusage_out(&wru.wru_self, &wru32.wru_self);
 		freebsd32_rusage_out(&wru.wru_children, &wru32.wru_children);
 		error = copyout(&wru32, uap->wrusage, sizeof(wru32));
 	}
 	if (uap->info != NULL && error == 0) {
 		siginfo_to_siginfo32 (&si, &si32);
 		error = copyout(&si32, uap->info, sizeof(si32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 static void
 copy_statfs(struct statfs *in, struct statfs32 *out)
 {
 
 	statfs_scale_blocks(in, INT32_MAX);
 	bzero(out, sizeof(*out));
 	CP(*in, *out, f_bsize);
 	out->f_iosize = MIN(in->f_iosize, INT32_MAX);
 	CP(*in, *out, f_blocks);
 	CP(*in, *out, f_bfree);
 	CP(*in, *out, f_bavail);
 	out->f_files = MIN(in->f_files, INT32_MAX);
 	out->f_ffree = MIN(in->f_ffree, INT32_MAX);
 	CP(*in, *out, f_fsid);
 	CP(*in, *out, f_owner);
 	CP(*in, *out, f_type);
 	CP(*in, *out, f_flags);
 	out->f_syncwrites = MIN(in->f_syncwrites, INT32_MAX);
 	out->f_asyncwrites = MIN(in->f_asyncwrites, INT32_MAX);
 	strlcpy(out->f_fstypename,
 	      in->f_fstypename, MFSNAMELEN);
 	strlcpy(out->f_mntonname,
 	      in->f_mntonname, min(MNAMELEN, FREEBSD4_MNAMELEN));
 	out->f_syncreads = MIN(in->f_syncreads, INT32_MAX);
 	out->f_asyncreads = MIN(in->f_asyncreads, INT32_MAX);
 	strlcpy(out->f_mntfromname,
 	      in->f_mntfromname, min(MNAMELEN, FREEBSD4_MNAMELEN));
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_getfsstat(struct thread *td,
     struct freebsd4_freebsd32_getfsstat_args *uap)
 {
 	struct statfs *buf, *sp;
 	struct statfs32 stat32;
 	size_t count, size, copycount;
 	int error;
 
 	count = uap->bufsize / sizeof(struct statfs32);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode);
 	if (size > 0) {
 		sp = buf;
 		copycount = count;
 		while (copycount > 0 && error == 0) {
 			copy_statfs(sp, &stat32);
 			error = copyout(&stat32, uap->buf, sizeof(stat32));
 			sp++;
 			uap->buf++;
 			copycount--;
 		}
 		free(buf, M_STATFS);
 	}
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD10
 int
 freebsd10_freebsd32_pipe(struct thread *td,
     struct freebsd10_freebsd32_pipe_args *uap) {
 	
 	return (freebsd10_pipe(td, (struct freebsd10_pipe_args*)uap));
 }
 #endif
 
 int
 freebsd32_sigaltstack(struct thread *td,
 		      struct freebsd32_sigaltstack_args *uap)
 {
 	struct sigaltstack32 s32;
 	struct sigaltstack ss, oss, *ssp;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		PTRIN_CP(s32, ss, ss_sp);
 		CP(s32, ss, ss_size);
 		CP(s32, ss, ss_flags);
 		ssp = &ss;
 	} else
 		ssp = NULL;
 	error = kern_sigaltstack(td, ssp, &oss);
 	if (error == 0 && uap->oss != NULL) {
 		PTROUT_CP(oss, s32, ss_sp);
 		CP(oss, s32, ss_size);
 		CP(oss, s32, ss_flags);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 
 /*
  * Custom version of exec_copyin_args() so that we can translate
  * the pointers.
  */
 int
 freebsd32_exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv)
 {
 	char *argp, *envp;
 	u_int32_t *p32, arg;
 	size_t length;
 	int error;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	if (fname != NULL) {
 		args->fname = args->buf;
 		error = (segflg == UIO_SYSSPACE) ?
 		    copystr(fname, args->fname, PATH_MAX, &length) :
 		    copyinstr(fname, args->fname, PATH_MAX, &length);
 		if (error != 0)
 			goto err_exit;
 	} else
 		length = 0;
 
 	args->begin_argv = args->buf + length;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	/*
 	 * extract arguments first
 	 */
 	p32 = argv;
 	for (;;) {
 		error = copyin(p32++, &arg, sizeof(arg));
 		if (error)
 			goto err_exit;
 		if (arg == 0)
 			break;
 		argp = PTRIN(arg);
 		error = copyinstr(argp, args->endp, args->stringspace, &length);
 		if (error) {
 			if (error == ENAMETOOLONG)
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 			
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		p32 = envv;
 		for (;;) {
 			error = copyin(p32++, &arg, sizeof(arg));
 			if (error)
 				goto err_exit;
 			if (arg == 0)
 				break;
 			envp = PTRIN(arg);
 			error = copyinstr(envp, args->endp, args->stringspace,
 			    &length);
 			if (error) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 int
 freebsd32_execve(struct thread *td, struct freebsd32_execve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &eargs, NULL);
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 int
 freebsd32_fexecve(struct thread *td, struct freebsd32_fexecve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		eargs.fd = uap->fd;
 		error = kern_execve(td, &eargs, NULL);
 	}
 	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
 int
 freebsd32_mprotect(struct thread *td, struct freebsd32_mprotect_args *uap)
 {
-	struct mprotect_args ap;
+	int prot;
 
-	ap.addr = PTRIN(uap->addr);
-	ap.len = uap->len;
-	ap.prot = uap->prot;
+	prot = uap->prot;
 #if defined(__amd64__)
-	if (i386_read_exec && (ap.prot & PROT_READ) != 0)
-		ap.prot |= PROT_EXEC;
+	if (i386_read_exec && (prot & PROT_READ) != 0)
+		prot |= PROT_EXEC;
 #endif
-	return (sys_mprotect(td, &ap));
+	return (kern_mprotect(td, (uintptr_t)PTRIN(uap->addr), uap->len,
+	    prot));
 }
 
 int
 freebsd32_mmap(struct thread *td, struct freebsd32_mmap_args *uap)
 {
-	struct mmap_args ap;
-	vm_offset_t addr = (vm_offset_t) uap->addr;
-	vm_size_t len	 = uap->len;
-	int prot	 = uap->prot;
-	int flags	 = uap->flags;
-	int fd		 = uap->fd;
-	off_t pos	 = PAIR32TO64(off_t,uap->pos);
+	int prot;
 
+	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ))
 		prot |= PROT_EXEC;
 #endif
 
-	ap.addr = (void *) addr;
-	ap.len = len;
-	ap.prot = prot;
-	ap.flags = flags;
-	ap.fd = fd;
-	ap.pos = pos;
-
-	return (sys_mmap(td, &ap));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot,
+	    uap->flags, uap->fd, PAIR32TO64(off_t, uap->pos)));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_mmap(struct thread *td,
     struct freebsd6_freebsd32_mmap_args *uap)
 {
-	struct freebsd32_mmap_args ap;
+	int prot;
 
-	ap.addr = uap->addr;
-	ap.len = uap->len;
-	ap.prot = uap->prot;
-	ap.flags = uap->flags;
-	ap.fd = uap->fd;
-	ap.pos1 = uap->pos1;
-	ap.pos2 = uap->pos2;
+	prot = uap->prot;
+#if defined(__amd64__)
+	if (i386_read_exec && (prot & PROT_READ))
+		prot |= PROT_EXEC;
+#endif
 
-	return (freebsd32_mmap(td, &ap));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot,
+	    uap->flags, uap->fd, PAIR32TO64(off_t, uap->pos)));
 }
 #endif
 
 int
 freebsd32_setitimer(struct thread *td, struct freebsd32_setitimer_args *uap)
 {
 	struct itimerval itv, oitv, *itvp;	
 	struct itimerval32 i32;
 	int error;
 
 	if (uap->itv != NULL) {
 		error = copyin(uap->itv, &i32, sizeof(i32));
 		if (error)
 			return (error);
 		TV_CP(i32, itv, it_interval);
 		TV_CP(i32, itv, it_value);
 		itvp = &itv;
 	} else
 		itvp = NULL;
 	error = kern_setitimer(td, uap->which, itvp, &oitv);
 	if (error || uap->oitv == NULL)
 		return (error);
 	TV_CP(oitv, i32, it_interval);
 	TV_CP(oitv, i32, it_value);
 	return (copyout(&i32, uap->oitv, sizeof(i32)));
 }
 
 int
 freebsd32_getitimer(struct thread *td, struct freebsd32_getitimer_args *uap)
 {
 	struct itimerval itv;
 	struct itimerval32 i32;
 	int error;
 
 	error = kern_getitimer(td, uap->which, &itv);
 	if (error || uap->itv == NULL)
 		return (error);
 	TV_CP(itv, i32, it_interval);
 	TV_CP(itv, i32, it_value);
 	return (copyout(&i32, uap->itv, sizeof(i32)));
 }
 
 int
 freebsd32_select(struct thread *td, struct freebsd32_select_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	int error;
 
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    sizeof(int32_t) * 8));
 }
 
 int
 freebsd32_pselect(struct thread *td, struct freebsd32_pselect_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timeval tv, *tvp;
 	sigset_t set, *uset;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->sm != NULL) {
 		error = copyin(uap->sm, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		uset = &set;
 	} else
 		uset = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	error = kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    uset, sizeof(int32_t) * 8);
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
 static int
 freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	int i, error = 0;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		CP(kevp[i], ks32[i], ident);
 		CP(kevp[i], ks32[i], filter);
 		CP(kevp[i], ks32[i], flags);
 		CP(kevp[i], ks32[i], fflags);
 		CP(kevp[i], ks32[i], data);
 		PTROUT_CP(kevp[i], ks32[i], udata);
 	}
 	error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	int i, error = 0;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	error = copyin(uap->changelist, ks32, count * sizeof *ks32);
 	if (error)
 		goto done;
 	uap->changelist += count;
 
 	for (i = 0; i < count; i++) {
 		CP(ks32[i], kevp[i], ident);
 		CP(ks32[i], kevp[i], filter);
 		CP(ks32[i], kevp[i], flags);
 		CP(ks32[i], kevp[i], fflags);
 		CP(ks32[i], kevp[i], data);
 		PTRIN_CP(ks32[i], kevp[i], udata);
 	}
 done:
 	return (error);
 }
 
 int
 freebsd32_kevent(struct thread *td, struct freebsd32_kevent_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct kevent_copyops k_ops = { uap,
 					freebsd32_kevent_copyout,
 					freebsd32_kevent_copyin};
 	int error;
 
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    &k_ops, tsp);
 	return (error);
 }
 
 int
 freebsd32_gettimeofday(struct thread *td,
 		       struct freebsd32_gettimeofday_args *uap)
 {
 	struct timeval atv;
 	struct timeval32 atv32;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		CP(atv, atv32, tv_sec);
 		CP(atv, atv32, tv_usec);
 		error = copyout(&atv32, uap->tp, sizeof (atv32));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = tz_minuteswest;
 		rtz.tz_dsttime = tz_dsttime;
 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
 	}
 	return (error);
 }
 
 int
 freebsd32_getrusage(struct thread *td, struct freebsd32_getrusage_args *uap)
 {
 	struct rusage32 s32;
 	struct rusage s;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &s);
 	if (error)
 		return (error);
 	if (uap->rusage != NULL) {
 		freebsd32_rusage_out(&s, &s32);
 		error = copyout(&s32, uap->rusage, sizeof(s32));
 	}
 	return (error);
 }
 
 static int
 freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec32 iov32;
 	struct iovec *iov;
 	struct uio *uio;
 	u_int iovlen;
 	int error, i;
 
 	*uiop = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (EINVAL);
 	iovlen = iovcnt * sizeof(struct iovec);
 	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
 	iov = (struct iovec *)(uio + 1);
 	for (i = 0; i < iovcnt; i++) {
 		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
 		if (error) {
 			free(uio, M_IOV);
 			return (error);
 		}
 		iov[i].iov_base = PTRIN(iov32.iov_base);
 		iov[i].iov_len = iov32.iov_len;
 	}
 	uio->uio_iov = iov;
 	uio->uio_iovcnt = iovcnt;
 	uio->uio_segflg = UIO_USERSPACE;
 	uio->uio_offset = -1;
 	uio->uio_resid = 0;
 	for (i = 0; i < iovcnt; i++) {
 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
 			free(uio, M_IOV);
 			return (EINVAL);
 		}
 		uio->uio_resid += iov->iov_len;
 		iov++;
 	}
 	*uiop = uio;
 	return (0);
 }
 
 int
 freebsd32_readv(struct thread *td, struct freebsd32_readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_writev(struct thread *td, struct freebsd32_writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_preadv(struct thread *td, struct freebsd32_preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_pwritev(struct thread *td, struct freebsd32_pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_copyiniov(struct iovec32 *iovp32, u_int iovcnt, struct iovec **iovp,
     int error)
 {
 	struct iovec32 iov32;
 	struct iovec *iov;
 	u_int iovlen;
 	int i;
 
 	*iovp = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (error);
 	iovlen = iovcnt * sizeof(struct iovec);
 	iov = malloc(iovlen, M_IOV, M_WAITOK);
 	for (i = 0; i < iovcnt; i++) {
 		error = copyin(&iovp32[i], &iov32, sizeof(struct iovec32));
 		if (error) {
 			free(iov, M_IOV);
 			return (error);
 		}
 		iov[i].iov_base = PTRIN(iov32.iov_base);
 		iov[i].iov_len = iov32.iov_len;
 	}
 	*iovp = iov;
 	return (0);
 }
 
 static int
 freebsd32_copyinmsghdr(struct msghdr32 *msg32, struct msghdr *msg)
 {
 	struct msghdr32 m32;
 	int error;
 
 	error = copyin(msg32, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	msg->msg_name = PTRIN(m32.msg_name);
 	msg->msg_namelen = m32.msg_namelen;
 	msg->msg_iov = PTRIN(m32.msg_iov);
 	msg->msg_iovlen = m32.msg_iovlen;
 	msg->msg_control = PTRIN(m32.msg_control);
 	msg->msg_controllen = m32.msg_controllen;
 	msg->msg_flags = m32.msg_flags;
 	return (0);
 }
 
 static int
 freebsd32_copyoutmsghdr(struct msghdr *msg, struct msghdr32 *msg32)
 {
 	struct msghdr32 m32;
 	int error;
 
 	m32.msg_name = PTROUT(msg->msg_name);
 	m32.msg_namelen = msg->msg_namelen;
 	m32.msg_iov = PTROUT(msg->msg_iov);
 	m32.msg_iovlen = msg->msg_iovlen;
 	m32.msg_control = PTROUT(msg->msg_control);
 	m32.msg_controllen = msg->msg_controllen;
 	m32.msg_flags = msg->msg_flags;
 	error = copyout(&m32, msg32, sizeof(m32));
 	return (error);
 }
 
 #ifndef __mips__
 #define FREEBSD32_ALIGNBYTES	(sizeof(int) - 1)
 #else
 #define FREEBSD32_ALIGNBYTES	(sizeof(long) - 1)
 #endif
 #define FREEBSD32_ALIGN(p)	\
 	(((u_long)(p) + FREEBSD32_ALIGNBYTES) & ~FREEBSD32_ALIGNBYTES)
 #define	FREEBSD32_CMSG_SPACE(l)	\
 	(FREEBSD32_ALIGN(sizeof(struct cmsghdr)) + FREEBSD32_ALIGN(l))
 
 #define	FREEBSD32_CMSG_DATA(cmsg)	((unsigned char *)(cmsg) + \
 				 FREEBSD32_ALIGN(sizeof(struct cmsghdr)))
 static int
 freebsd32_copy_msg_out(struct msghdr *msg, struct mbuf *control)
 {
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen;
 	int error;
 	caddr_t ctlbuf;
 	int len, maxlen, copylen;
 	struct mbuf *m;
 	error = 0;
 
 	len    = msg->msg_controllen;
 	maxlen = msg->msg_controllen;
 	msg->msg_controllen = 0;
 
 	m = control;
 	ctlbuf = msg->msg_control;
       
 	while (m && len > 0) {
 		cm = mtod(m, struct cmsghdr *);
 		clen = m->m_len;
 
 		while (cm != NULL) {
 
 			if (sizeof(struct cmsghdr) > clen ||
 			    cm->cmsg_len > clen) {
 				error = EINVAL;
 				break;
 			}	
 
 			data   = CMSG_DATA(cm);
 			datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 
 			/* Adjust message length */
 			cm->cmsg_len = FREEBSD32_ALIGN(sizeof(struct cmsghdr)) +
 			    datalen;
 
 
 			/* Copy cmsghdr */
 			copylen = sizeof(struct cmsghdr);
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				copylen = len;
 			}
 
 			error = copyout(cm,ctlbuf,copylen);
 			if (error)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			if (len <= 0)
 				break;
 
 			/* Copy data */
 			copylen = datalen;
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				copylen = len;
 			}
 
 			error = copyout(data,ctlbuf,copylen);
 			if (error)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			if (CMSG_SPACE(datalen) < clen) {
 				clen -= CMSG_SPACE(datalen);
 				cm = (struct cmsghdr *)
 					((caddr_t)cm + CMSG_SPACE(datalen));
 			} else {
 				clen = 0;
 				cm = NULL;
 			}
 		}	
 		m = m->m_next;
 	}
 
 	msg->msg_controllen = (len <= 0) ? maxlen :  ctlbuf - (caddr_t)msg->msg_control;
 	
 exit:
 	return (error);
 
 }
 
 int
 freebsd32_recvmsg(td, uap)
 	struct thread *td;
 	struct freebsd32_recvmsg_args /* {
 		int	s;
 		struct	msghdr32 *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct msghdr32 m32;
 	struct iovec *uiov, *iov;
 	struct mbuf *control = NULL;
 	struct mbuf **controlp;
 
 	int error;
 	error = copyin(uap->msg, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags;
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 
 	controlp = (msg.msg_control != NULL) ?  &control : NULL;
 	error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, controlp);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 		
 		if (control != NULL)
 			error = freebsd32_copy_msg_out(&msg, control);
 		else
 			msg.msg_controllen = 0;
 		
 		if (error == 0)
 			error = freebsd32_copyoutmsghdr(&msg, uap->msg);
 	}
 	free(iov, M_IOV);
 
 	if (control != NULL)
 		m_freem(control);
 
 	return (error);
 }
 
 /*
  * Copy-in the array of control messages constructed using alignment
  * and padding suitable for a 32-bit environment and construct an
  * mbuf using alignment and padding suitable for a 64-bit kernel.
  * The alignment and padding are defined indirectly by CMSG_DATA(),
  * CMSG_SPACE() and CMSG_LEN().
  */
 static int
 freebsd32_copyin_control(struct mbuf **mp, caddr_t buf, u_int buflen)
 {
 	struct mbuf *m;
 	void *md;
 	u_int idx, len, msglen;
 	int error;
 
 	buflen = FREEBSD32_ALIGN(buflen);
 
 	if (buflen > MCLBYTES)
 		return (EINVAL);
 
 	/*
 	 * Iterate over the buffer and get the length of each message
 	 * in there. This has 32-bit alignment and padding. Use it to
 	 * determine the length of these messages when using 64-bit
 	 * alignment and padding.
 	 */
 	idx = 0;
 	len = 0;
 	while (idx < buflen) {
 		error = copyin(buf + idx, &msglen, sizeof(msglen));
 		if (error)
 			return (error);
 		if (msglen < sizeof(struct cmsghdr))
 			return (EINVAL);
 		msglen = FREEBSD32_ALIGN(msglen);
 		if (idx + msglen > buflen)
 			return (EINVAL);
 		idx += msglen;
 		msglen += CMSG_ALIGN(sizeof(struct cmsghdr)) -
 		    FREEBSD32_ALIGN(sizeof(struct cmsghdr));
 		len += CMSG_ALIGN(msglen);
 	}
 
 	if (len > MCLBYTES)
 		return (EINVAL);
 
 	m = m_get(M_WAITOK, MT_CONTROL);
 	if (len > MLEN)
 		MCLGET(m, M_WAITOK);
 	m->m_len = len;
 
 	md = mtod(m, void *);
 	while (buflen > 0) {
 		error = copyin(buf, md, sizeof(struct cmsghdr));
 		if (error)
 			break;
 		msglen = *(u_int *)md;
 		msglen = FREEBSD32_ALIGN(msglen);
 
 		/* Modify the message length to account for alignment. */
 		*(u_int *)md = msglen + CMSG_ALIGN(sizeof(struct cmsghdr)) -
 		    FREEBSD32_ALIGN(sizeof(struct cmsghdr));
 
 		md = (char *)md + CMSG_ALIGN(sizeof(struct cmsghdr));
 		buf += FREEBSD32_ALIGN(sizeof(struct cmsghdr));
 		buflen -= FREEBSD32_ALIGN(sizeof(struct cmsghdr));
 
 		msglen -= FREEBSD32_ALIGN(sizeof(struct cmsghdr));
 		if (msglen > 0) {
 			error = copyin(buf, md, msglen);
 			if (error)
 				break;
 			md = (char *)md + CMSG_ALIGN(msglen);
 			buf += msglen;
 			buflen -= msglen;
 		}
 	}
 
 	if (error)
 		m_free(m);
 	else
 		*mp = m;
 	return (error);
 }
 
 int
 freebsd32_sendmsg(struct thread *td,
 		  struct freebsd32_sendmsg_args *uap)
 {
 	struct msghdr msg;
 	struct msghdr32 m32;
 	struct iovec *iov;
 	struct mbuf *control = NULL;
 	struct sockaddr *to = NULL;
 	int error;
 
 	error = copyin(uap->msg, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 	if (msg.msg_name != NULL) {
 		error = getsockaddr(&to, msg.msg_name, msg.msg_namelen);
 		if (error) {
 			to = NULL;
 			goto out;
 		}
 		msg.msg_name = to;
 	}
 
 	if (msg.msg_control) {
 		if (msg.msg_controllen < sizeof(struct cmsghdr)) {
 			error = EINVAL;
 			goto out;
 		}
 
 		error = freebsd32_copyin_control(&control, msg.msg_control,
 		    msg.msg_controllen);
 		if (error)
 			goto out;
 
 		msg.msg_control = NULL;
 		msg.msg_controllen = 0;
 	}
 
 	error = kern_sendit(td, uap->s, &msg, uap->flags, control,
 	    UIO_USERSPACE);
 
 out:
 	free(iov, M_IOV);
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 }
 
 int
 freebsd32_recvfrom(struct thread *td,
 		   struct freebsd32_recvfrom_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (uap->fromlenaddr) {
 		error = copyin(PTRIN(uap->fromlenaddr), &msg.msg_namelen,
 		    sizeof(msg.msg_namelen));
 		if (error)
 			return (error);
 	} else {
 		msg.msg_namelen = 0;
 	}
 
 	msg.msg_name = PTRIN(uap->from);
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = PTRIN(uap->buf);
 	aiov.iov_len = uap->len;
 	msg.msg_control = NULL;
 	msg.msg_flags = uap->flags;
 	error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, NULL);
 	if (error == 0 && uap->fromlenaddr)
 		error = copyout(&msg.msg_namelen, PTRIN(uap->fromlenaddr),
 		    sizeof (msg.msg_namelen));
 	return (error);
 }
 
 int
 freebsd32_settimeofday(struct thread *td,
 		       struct freebsd32_settimeofday_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	struct timezone tz, *tzp;
 	int error;
 
 	if (uap->tv) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &tz, sizeof(tz));
 		if (error)
 			return (error);
 		tzp = &tz;
 	} else
 		tzp = NULL;
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 freebsd32_utimes(struct thread *td, struct freebsd32_utimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_lutimes(struct thread *td, struct freebsd32_lutimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimes(struct thread *td, struct freebsd32_futimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_futimes(td, uap->fd, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimesat(struct thread *td, struct freebsd32_futimesat_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 		sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimens(struct thread *td, struct freebsd32_futimens_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_futimens(td, uap->fd, tsp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_utimensat(struct thread *td, struct freebsd32_utimensat_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    tsp, UIO_SYSSPACE, uap->flag));
 }
 
 int
 freebsd32_adjtime(struct thread *td, struct freebsd32_adjtime_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval delta, olddelta, *deltap;
 	int error;
 
 	if (uap->delta) {
 		error = copyin(uap->delta, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, delta, tv_sec);
 		CP(tv32, delta, tv_usec);
 		deltap = &delta;
 	} else
 		deltap = NULL;
 	error = kern_adjtime(td, deltap, &olddelta);
 	if (uap->olddelta && error == 0) {
 		CP(olddelta, tv32, tv_sec);
 		CP(olddelta, tv32, tv_usec);
 		error = copyout(&tv32, uap->olddelta, sizeof(tv32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_statfs(struct thread *td, struct freebsd4_freebsd32_statfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fstatfs(struct thread *td, struct freebsd4_freebsd32_fstatfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fhstatfs(struct thread *td, struct freebsd4_freebsd32_fhstatfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	fhandle_t fh;
 	int error;
 
 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
 		return (error);
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 int
 freebsd32_pread(struct thread *td, struct freebsd32_pread_args *uap)
 {
 	struct pread_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.nbyte = uap->nbyte;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	return (sys_pread(td, &ap));
 }
 
 int
 freebsd32_pwrite(struct thread *td, struct freebsd32_pwrite_args *uap)
 {
 	struct pwrite_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.nbyte = uap->nbyte;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	return (sys_pwrite(td, &ap));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lseek(struct thread *td, struct ofreebsd32_lseek_args *uap)
 {
 	struct lseek_args nuap;
 
 	nuap.fd = uap->fd;
 	nuap.offset = uap->offset;
 	nuap.whence = uap->whence;
 	return (sys_lseek(td, &nuap));
 }
 #endif
 
 int
 freebsd32_lseek(struct thread *td, struct freebsd32_lseek_args *uap)
 {
 	int error;
 	struct lseek_args ap;
 	off_t pos;
 
 	ap.fd = uap->fd;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	ap.whence = uap->whence;
 	error = sys_lseek(td, &ap);
 	/* Expand the quad return into two parts for eax and edx */
 	pos = td->td_uretoff.tdu_off;
 	td->td_retval[RETVAL_LO] = pos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = pos >> 32;		/* %edx */
 	return error;
 }
 
 int
 freebsd32_truncate(struct thread *td, struct freebsd32_truncate_args *uap)
 {
 	struct truncate_args ap;
 
 	ap.path = uap->path;
 	ap.length = PAIR32TO64(off_t,uap->length);
 	return (sys_truncate(td, &ap));
 }
 
 int
 freebsd32_ftruncate(struct thread *td, struct freebsd32_ftruncate_args *uap)
 {
 	struct ftruncate_args ap;
 
 	ap.fd = uap->fd;
 	ap.length = PAIR32TO64(off_t,uap->length);
 	return (sys_ftruncate(td, &ap));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_getdirentries(struct thread *td,
     struct ofreebsd32_getdirentries_args *uap)
 {
 	struct ogetdirentries_args ap;
 	int error;
 	long loff;
 	int32_t loff_cut;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	error = kern_ogetdirentries(td, &ap, &loff);
 	if (error == 0) {
 		loff_cut = loff;
 		error = copyout(&loff_cut, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_getdirentries(struct thread *td,
     struct freebsd32_getdirentries_args *uap)
 {
 	long base;
 	int32_t base32;
 	int error;
 
 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
 	    NULL, UIO_USERSPACE);
 	if (error)
 		return (error);
 	if (uap->basep != NULL) {
 		base32 = base;
 		error = copyout(&base32, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD6
 /* versions with the 'int pad' argument */
 int
 freebsd6_freebsd32_pread(struct thread *td, struct freebsd6_freebsd32_pread_args *uap)
 {
 	struct pread_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.nbyte = uap->nbyte;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	return (sys_pread(td, &ap));
 }
 
 int
 freebsd6_freebsd32_pwrite(struct thread *td, struct freebsd6_freebsd32_pwrite_args *uap)
 {
 	struct pwrite_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.nbyte = uap->nbyte;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	return (sys_pwrite(td, &ap));
 }
 
 int
 freebsd6_freebsd32_lseek(struct thread *td, struct freebsd6_freebsd32_lseek_args *uap)
 {
 	int error;
 	struct lseek_args ap;
 	off_t pos;
 
 	ap.fd = uap->fd;
 	ap.offset = PAIR32TO64(off_t,uap->offset);
 	ap.whence = uap->whence;
 	error = sys_lseek(td, &ap);
 	/* Expand the quad return into two parts for eax and edx */
 	pos = *(off_t *)(td->td_retval);
 	td->td_retval[RETVAL_LO] = pos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = pos >> 32;		/* %edx */
 	return error;
 }
 
 int
 freebsd6_freebsd32_truncate(struct thread *td, struct freebsd6_freebsd32_truncate_args *uap)
 {
 	struct truncate_args ap;
 
 	ap.path = uap->path;
 	ap.length = PAIR32TO64(off_t,uap->length);
 	return (sys_truncate(td, &ap));
 }
 
 int
 freebsd6_freebsd32_ftruncate(struct thread *td, struct freebsd6_freebsd32_ftruncate_args *uap)
 {
 	struct ftruncate_args ap;
 
 	ap.fd = uap->fd;
 	ap.length = PAIR32TO64(off_t,uap->length);
 	return (sys_ftruncate(td, &ap));
 }
 #endif /* COMPAT_FREEBSD6 */
 
 struct sf_hdtr32 {
 	uint32_t headers;
 	int hdr_cnt;
 	uint32_t trailers;
 	int trl_cnt;
 };
 
 static int
 freebsd32_do_sendfile(struct thread *td,
     struct freebsd32_sendfile_args *uap, int compat)
 {
 	struct sf_hdtr32 hdtr32;
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	cap_rights_t rights;
 	struct iovec32 *iov32;
 	off_t offset, sbytes;
 	int error;
 
 	offset = PAIR32TO64(off_t, uap->offset);
 	if (offset < 0)
 		return (EINVAL);
 
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr32, sizeof(hdtr32));
 		if (error)
 			goto out;
 		PTRIN_CP(hdtr32, hdtr, headers);
 		CP(hdtr32, hdtr, hdr_cnt);
 		PTRIN_CP(hdtr32, hdtr, trailers);
 		CP(hdtr32, hdtr, trl_cnt);
 
 		if (hdtr.headers != NULL) {
 			iov32 = PTRIN(hdtr32.headers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.hdr_cnt, &hdr_uio);
 			if (error)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			iov32 = PTRIN(hdtr32.trailers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.trl_cnt, &trl_uio);
 			if (error)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	if ((error = fget_read(td, uap->fd,
 	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	if (hdr_uio)
 		free(hdr_uio, M_IOV);
 	if (trl_uio)
 		free(trl_uio, M_IOV);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_sendfile(struct thread *td,
     struct freebsd4_freebsd32_sendfile_args *uap)
 {
 	return (freebsd32_do_sendfile(td,
 	    (struct freebsd32_sendfile_args *)uap, 1));
 }
 #endif
 
 int
 freebsd32_sendfile(struct thread *td, struct freebsd32_sendfile_args *uap)
 {
 
 	return (freebsd32_do_sendfile(td, uap, 0));
 }
 
 static void
 copy_stat(struct stat *in, struct stat32 *out)
 {
 
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	CP(*in, *out, st_size);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 	TS_CP(*in, *out, st_birthtim);
 }
 
 #ifdef COMPAT_43
 static void
 copy_ostat(struct stat *in, struct ostat32 *out)
 {
 
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 	CP(*in, *out, st_size);
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 }
 #endif
 
 int
 freebsd32_stat(struct thread *td, struct freebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct stat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error)
 		return (error);
 	copy_stat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_stat(struct thread *td, struct ofreebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstat(struct thread *td, struct freebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->ub, sizeof(ub32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_fstat(struct thread *td, struct ofreebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct ostat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_ostat(&ub, &ub32);
 	error = copyout(&ub32, uap->ub, sizeof(ub32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstatat(struct thread *td, struct freebsd32_fstatat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE,
 	    &ub, NULL);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->buf, sizeof(ub32));
 	return (error);
 }
 
 int
 freebsd32_lstat(struct thread *td, struct freebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct stat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error)
 		return (error);
 	copy_stat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lstat(struct thread *td, struct ofreebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_sysctl(struct thread *td, struct freebsd32_sysctl_args *uap)
 {
 	int error, name[CTL_MAXNAME];
 	size_t j, oldlen;
 	uint32_t tmp;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
 		return (EINVAL);
  	error = copyin(uap->name, name, uap->namelen * sizeof(int));
  	if (error)
 		return (error);
 	if (uap->oldlenp) {
 		error = fueword32(uap->oldlenp, &tmp);
 		oldlen = tmp;
 	} else {
 		oldlen = 0;
 	}
 	if (error != 0)
 		return (EFAULT);
 	error = userland_sysctl(td, name, uap->namelen,
 		uap->old, &oldlen, 1,
 		uap->new, uap->newlen, &j, SCTL_MASK32);
 	if (error && error != ENOMEM)
 		return (error);
 	if (uap->oldlenp)
 		suword32(uap->oldlenp, j);
 	return (0);
 }
 
 int
 freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap)
 {
 	uint32_t version;
 	int error;
 	struct jail j;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error)
 		return (error);
 
 	switch (version) {
 	case 0:
 	{
 		/* FreeBSD single IPv4 jails. */
 		struct jail32_v0 j32_v0;
 
 		bzero(&j, sizeof(struct jail));
 		error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0));
 		if (error)
 			return (error);
 		CP(j32_v0, j, version);
 		PTRIN_CP(j32_v0, j, path);
 		PTRIN_CP(j32_v0, j, hostname);
 		j.ip4s = htonl(j32_v0.ip_number);	/* jail_v0 is host order */
 		break;
 	}
 
 	case 1:
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel.
 		 */
 		return (EINVAL);
 
 	case 2:	/* JAIL_API_VERSION */
 	{
 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
 		struct jail32 j32;
 
 		error = copyin(uap->jail, &j32, sizeof(struct jail32));
 		if (error)
 			return (error);
 		CP(j32, j, version);
 		PTRIN_CP(j32, j, path);
 		PTRIN_CP(j32, j, hostname);
 		PTRIN_CP(j32, j, jailname);
 		CP(j32, j, ip4s);
 		CP(j32, j, ip6s);
 		PTRIN_CP(j32, j, ip4);
 		PTRIN_CP(j32, j, ip6);
 		break;
 	}
 
 	default:
 		/* Sci-Fi jails are not supported, sorry. */
 		return (EINVAL);
 	}
 	return (kern_jail(td, &j));
 }
 
 int
 freebsd32_jail_set(struct thread *td, struct freebsd32_jail_set_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_set(td, auio, uap->flags);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_jail_get(struct thread *td, struct freebsd32_jail_get_args *uap)
 {
 	struct iovec32 iov32;
 	struct uio *auio;
 	int error, i;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_get(td, auio, uap->flags);
 	if (error == 0)
 		for (i = 0; i < uap->iovcnt; i++) {
 			PTROUT_CP(auio->uio_iov[i], iov32, iov_base);
 			CP(auio->uio_iov[i], iov32, iov_len);
 			error = copyout(&iov32, uap->iovp + i, sizeof(iov32));
 			if (error != 0)
 				break;
 		}
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_sigaction(struct thread *td, struct freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, 0);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_sigaction(struct thread *td,
 			     struct freebsd4_freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, KSA_FREEBSD4);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_43
 struct osigaction32 {
 	u_int32_t	sa_u;
 	osigset_t	sa_mask;
 	int		sa_flags;
 };
 
 #define	ONSIG	32
 
 int
 ofreebsd32_sigaction(struct thread *td,
 			     struct ofreebsd32_sigaction_args *uap)
 {
 	struct osigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsa) {
 		error = copyin(uap->nsa, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		OSIG2SIG(s32.sa_mask, sa.sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osa != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		SIG2OSIG(osa.sa_mask, s32.sa_mask);
 		error = copyout(&s32, uap->osa, sizeof(s32));
 	}
 	return (error);
 }
 
 int
 ofreebsd32_sigprocmask(struct thread *td,
 			       struct ofreebsd32_sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, SIGPROCMASK_OLD);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 
 int
 ofreebsd32_sigpending(struct thread *td,
 			      struct ofreebsd32_sigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t siglist;
 
 	PROC_LOCK(p);
 	siglist = p->p_siglist;
 	SIGSETOR(siglist, td->td_siglist);
 	PROC_UNLOCK(p);
 	SIG2OSIG(siglist, td->td_retval[0]);
 	return (0);
 }
 
 struct sigvec32 {
 	u_int32_t	sv_handler;
 	int		sv_mask;
 	int		sv_flags;
 };
 
 int
 ofreebsd32_sigvec(struct thread *td,
 			  struct ofreebsd32_sigvec_args *uap)
 {
 	struct sigvec32 vec;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsv) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(vec.sv_handler);
 		OSIG2SIG(vec.sv_mask, sa.sa_mask);
 		sa.sa_flags = vec.sv_flags;
 		sa.sa_flags ^= SA_RESTART;
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osv != NULL) {
 		vec.sv_handler = PTROUT(osa.sa_handler);
 		SIG2OSIG(osa.sa_mask, vec.sv_mask);
 		vec.sv_flags = osa.sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 int
 ofreebsd32_sigblock(struct thread *td,
 			    struct ofreebsd32_sigblock_args *uap)
 {
 	sigset_t set, oset;
 
 	OSIG2SIG(uap->mask, set);
 	kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (0);
 }
 
 int
 ofreebsd32_sigsetmask(struct thread *td,
 			      struct ofreebsd32_sigsetmask_args *uap)
 {
 	sigset_t set, oset;
 
 	OSIG2SIG(uap->mask, set);
 	kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (0);
 }
 
 int
 ofreebsd32_sigsuspend(struct thread *td,
 			      struct ofreebsd32_sigsuspend_args *uap)
 {
 	sigset_t mask;
 
 	OSIG2SIG(uap->mask, mask);
 	return (kern_sigsuspend(td, mask));
 }
 
 struct sigstack32 {
 	u_int32_t	ss_sp;
 	int		ss_onstack;
 };
 
 int
 ofreebsd32_sigstack(struct thread *td,
 			    struct ofreebsd32_sigstack_args *uap)
 {
 	struct sigstack32 s32;
 	struct sigstack nss, oss;
 	int error = 0, unss;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		nss.ss_sp = PTRIN(s32.ss_sp);
 		CP(s32, nss, ss_onstack);
 		unss = 1;
 	} else {
 		unss = 0;
 	}
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (unss) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= (nss.ss_onstack & SS_ONSTACK);
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL) {
 		s32.ss_sp = PTROUT(oss.ss_sp);
 		CP(oss, s32, ss_onstack);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_nanosleep(struct thread *td, struct freebsd32_nanosleep_args *uap)
 {
 	struct timespec32 rmt32, rqt32;
 	struct timespec rmt, rqt;
 	int error;
 
 	error = copyin(uap->rqtp, &rqt32, sizeof(rqt32));
 	if (error)
 		return (error);
 
 	CP(rqt32, rqt, tv_sec);
 	CP(rqt32, rqt, tv_nsec);
 
 	if (uap->rmtp &&
 	    !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
 		return (EFAULT);
 	error = kern_nanosleep(td, &rqt, &rmt);
 	if (error && uap->rmtp) {
 		int error2;
 
 		CP(rmt, rmt32, tv_sec);
 		CP(rmt, rmt32, tv_nsec);
 
 		error2 = copyout(&rmt32, uap->rmtp, sizeof(rmt32));
 		if (error2)
 			error = error2;
 	}
 	return (error);
 }
 
 int
 freebsd32_clock_gettime(struct thread *td,
 			struct freebsd32_clock_gettime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = kern_clock_gettime(td, uap->clock_id, &ats);
 	if (error == 0) {
 		CP(ats, ats32, tv_sec);
 		CP(ats, ats32, tv_nsec);
 		error = copyout(&ats32, uap->tp, sizeof(ats32));
 	}
 	return (error);
 }
 
 int
 freebsd32_clock_settime(struct thread *td,
 			struct freebsd32_clock_settime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = copyin(uap->tp, &ats32, sizeof(ats32));
 	if (error)
 		return (error);
 	CP(ats32, ats, tv_sec);
 	CP(ats32, ats, tv_nsec);
 
 	return (kern_clock_settime(td, uap->clock_id, &ats));
 }
 
 int
 freebsd32_clock_getres(struct thread *td,
 		       struct freebsd32_clock_getres_args *uap)
 {
 	struct timespec	ts;
 	struct timespec32 ts32;
 	int error;
 
 	if (uap->tp == NULL)
 		return (0);
 	error = kern_clock_getres(td, uap->clock_id, &ts);
 	if (error == 0) {
 		CP(ts, ts32, tv_sec);
 		CP(ts, ts32, tv_nsec);
 		error = copyout(&ts32, uap->tp, sizeof(ts32));
 	}
 	return (error);
 }
 
 int freebsd32_ktimer_create(struct thread *td,
     struct freebsd32_ktimer_create_args *uap)
 {
 	struct sigevent32 ev32;
 	struct sigevent ev, *evp;
 	int error, id;
 
 	if (uap->evp == NULL) {
 		evp = NULL;
 	} else {
 		evp = &ev;
 		error = copyin(uap->evp, &ev32, sizeof(ev32));
 		if (error != 0)
 			return (error);
 		error = convert_sigevent32(&ev32, &ev);
 		if (error != 0)
 			return (error);
 	}
 	error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
 	if (error == 0) {
 		error = copyout(&id, uap->timerid, sizeof(int));
 		if (error != 0)
 			kern_ktimer_delete(td, id);
 	}
 	return (error);
 }
 
 int
 freebsd32_ktimer_settime(struct thread *td,
     struct freebsd32_ktimer_settime_args *uap)
 {
 	struct itimerspec32 val32, oval32;
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
 	error = copyin(uap->value, &val32, sizeof(val32));
 	if (error != 0)
 		return (error);
 	ITS_CP(val32, val);
 	ovalp = uap->ovalue != NULL ? &oval : NULL;
 	error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
 	if (error == 0 && uap->ovalue != NULL) {
 		ITS_CP(oval, oval32);
 		error = copyout(&oval32, uap->ovalue, sizeof(oval32));
 	}
 	return (error);
 }
 
 int
 freebsd32_ktimer_gettime(struct thread *td,
     struct freebsd32_ktimer_gettime_args *uap)
 {
 	struct itimerspec32 val32;
 	struct itimerspec val;
 	int error;
 
 	error = kern_ktimer_gettime(td, uap->timerid, &val);
 	if (error == 0) {
 		ITS_CP(val, val32);
 		error = copyout(&val32, uap->value, sizeof(val32));
 	}
 	return (error);
 }
 
 int
 freebsd32_clock_getcpuclockid2(struct thread *td,
     struct freebsd32_clock_getcpuclockid2_args *uap)
 {
 	clockid_t clk_id;
 	int error;
 
 	error = kern_clock_getcpuclockid2(td, PAIR32TO64(id_t, uap->id),
 	    uap->which, &clk_id);
 	if (error == 0)
 		error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
 	return (error);
 }
 
 int
 freebsd32_thr_new(struct thread *td,
 		  struct freebsd32_thr_new_args *uap)
 {
 	struct thr_param32 param32;
 	struct thr_param param;
 	int error;
 
 	if (uap->param_size < 0 ||
 	    uap->param_size > sizeof(struct thr_param32))
 		return (EINVAL);
 	bzero(&param, sizeof(struct thr_param));
 	bzero(&param32, sizeof(struct thr_param32));
 	error = copyin(uap->param, &param32, uap->param_size);
 	if (error != 0)
 		return (error);
 	param.start_func = PTRIN(param32.start_func);
 	param.arg = PTRIN(param32.arg);
 	param.stack_base = PTRIN(param32.stack_base);
 	param.stack_size = param32.stack_size;
 	param.tls_base = PTRIN(param32.tls_base);
 	param.tls_size = param32.tls_size;
 	param.child_tid = PTRIN(param32.child_tid);
 	param.parent_tid = PTRIN(param32.parent_tid);
 	param.flags = param32.flags;
 	param.rtp = PTRIN(param32.rtp);
 	param.spare[0] = PTRIN(param32.spare[0]);
 	param.spare[1] = PTRIN(param32.spare[1]);
 	param.spare[2] = PTRIN(param32.spare[2]);
 
 	return (kern_thr_new(td, &param));
 }
 
 int
 freebsd32_thr_suspend(struct thread *td, struct freebsd32_thr_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	error = 0;
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin((const void *)uap->timeout, (void *)&ts32,
 		    sizeof(struct timespec32));
 		if (error != 0)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		tsp = &ts;
 	}
 	return (kern_thr_suspend(td, tsp));
 }
 
 void
 siginfo_to_siginfo32(const siginfo_t *src, struct siginfo32 *dst)
 {
 	bzero(dst, sizeof(*dst));
 	dst->si_signo = src->si_signo;
 	dst->si_errno = src->si_errno;
 	dst->si_code = src->si_code;
 	dst->si_pid = src->si_pid;
 	dst->si_uid = src->si_uid;
 	dst->si_status = src->si_status;
 	dst->si_addr = (uintptr_t)src->si_addr;
 	dst->si_value.sival_int = src->si_value.sival_int;
 	dst->si_timerid = src->si_timerid;
 	dst->si_overrun = src->si_overrun;
 }
 
 int
 freebsd32_sigtimedwait(struct thread *td, struct freebsd32_sigtimedwait_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * MPSAFE
  */
 int
 freebsd32_sigwaitinfo(struct thread *td, struct freebsd32_sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 freebsd32_cpuset_setid(struct thread *td,
     struct freebsd32_cpuset_setid_args *uap)
 {
 	struct cpuset_setid_args ap;
 
 	ap.which = uap->which;
 	ap.id = PAIR32TO64(id_t,uap->id);
 	ap.setid = uap->setid;
 
 	return (sys_cpuset_setid(td, &ap));
 }
 
 int
 freebsd32_cpuset_getid(struct thread *td,
     struct freebsd32_cpuset_getid_args *uap)
 {
 	struct cpuset_getid_args ap;
 
 	ap.level = uap->level;
 	ap.which = uap->which;
 	ap.id = PAIR32TO64(id_t,uap->id);
 	ap.setid = uap->setid;
 
 	return (sys_cpuset_getid(td, &ap));
 }
 
 int
 freebsd32_cpuset_getaffinity(struct thread *td,
     struct freebsd32_cpuset_getaffinity_args *uap)
 {
 	struct cpuset_getaffinity_args ap;
 
 	ap.level = uap->level;
 	ap.which = uap->which;
 	ap.id = PAIR32TO64(id_t,uap->id);
 	ap.cpusetsize = uap->cpusetsize;
 	ap.mask = uap->mask;
 
 	return (sys_cpuset_getaffinity(td, &ap));
 }
 
 int
 freebsd32_cpuset_setaffinity(struct thread *td,
     struct freebsd32_cpuset_setaffinity_args *uap)
 {
 	struct cpuset_setaffinity_args ap;
 
 	ap.level = uap->level;
 	ap.which = uap->which;
 	ap.id = PAIR32TO64(id_t,uap->id);
 	ap.cpusetsize = uap->cpusetsize;
 	ap.mask = uap->mask;
 
 	return (sys_cpuset_setaffinity(td, &ap));
 }
 
 int
 freebsd32_nmount(struct thread *td,
     struct freebsd32_nmount_args /* {
     	struct iovec *iovp;
     	unsigned int iovcnt;
     	int flags;
     } */ *uap)
 {
 	struct uio *auio;
 	uint64_t flags;
 	int error;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit archtectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	/*
 	 * check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((uap->iovcnt & 1) || (uap->iovcnt < 4))
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = vfs_donmount(td, flags, auio);
 
 	free(auio, M_IOV);
 	return error;
 }
 
 #if 0
 int
 freebsd32_xxx(struct thread *td, struct freebsd32_xxx_args *uap)
 {
 	struct yyy32 *p32, s32;
 	struct yyy *p = NULL, s;
 	struct xxx_arg ap;
 	int error;
 
 	if (uap->zzz) {
 		error = copyin(uap->zzz, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		/* translate in */
 		p = &s;
 	}
 	error = kern_xxx(td, p);
 	if (error)
 		return (error);
 	if (uap->zzz) {
 		/* translate out */
 		error = copyout(&s32, p32, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 int
 syscall32_register(int *offset, struct sysent *new_sysent,
     struct sysent *old_sysent, int flags)
 {
 
 	if ((flags & ~SY_THR_STATIC) != 0)
 		return (EINVAL);
 
 	if (*offset == NO_SYSCALL) {
 		int i;
 
 		for (i = 1; i < SYS_MAXSYSCALL; ++i)
 			if (freebsd32_sysent[i].sy_call ==
 			    (sy_call_t *)lkmnosys)
 				break;
 		if (i == SYS_MAXSYSCALL)
 			return (ENFILE);
 		*offset = i;
 	} else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
 		return (EINVAL);
 	else if (freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
 	    freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmressys)
 		return (EEXIST);
 
 	*old_sysent = freebsd32_sysent[*offset];
 	freebsd32_sysent[*offset] = *new_sysent;
 	atomic_store_rel_32(&freebsd32_sysent[*offset].sy_thrcnt, flags);
 	return (0);
 }
 
 int
 syscall32_deregister(int *offset, struct sysent *old_sysent)
 {
 
 	if (*offset == 0)
 		return (0);
 
 	freebsd32_sysent[*offset] = *old_sysent;
 	return (0);
 }
 
 int
 syscall32_module_handler(struct module *mod, int what, void *arg)
 {
 	struct syscall_module_data *data = (struct syscall_module_data*)arg;
 	modspecific_t ms;
 	int error;
 
 	switch (what) {
 	case MOD_LOAD:
 		error = syscall32_register(data->offset, data->new_sysent,
 		    &data->old_sysent, SY_THR_STATIC_KLD);
 		if (error) {
 			/* Leave a mark so we know to safely unload below. */
 			data->offset = NULL;
 			return error;
 		}
 		ms.intval = *data->offset;
 		MOD_XLOCK;
 		module_setspecific(mod, &ms);
 		MOD_XUNLOCK;
 		if (data->chainevh)
 			error = data->chainevh(mod, what, data->chainarg);
 		return (error);
 	case MOD_UNLOAD:
 		/*
 		 * MOD_LOAD failed, so just return without calling the
 		 * chained handler since we didn't pass along the MOD_LOAD
 		 * event.
 		 */
 		if (data->offset == NULL)
 			return (0);
 		if (data->chainevh) {
 			error = data->chainevh(mod, what, data->chainarg);
 			if (error)
 				return (error);
 		}
 		error = syscall32_deregister(data->offset, &data->old_sysent);
 		return (error);
 	default:
 		error = EOPNOTSUPP;
 		if (data->chainevh)
 			error = data->chainevh(mod, what, data->chainarg);
 		return (error);
 	}
 }
 
 int
 syscall32_helper_register(struct syscall_helper_data *sd, int flags)
 {
 	struct syscall_helper_data *sd1;
 	int error;
 
 	for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) {
 		error = syscall32_register(&sd1->syscall_no, &sd1->new_sysent,
 		    &sd1->old_sysent, flags);
 		if (error != 0) {
 			syscall32_helper_unregister(sd);
 			return (error);
 		}
 		sd1->registered = 1;
 	}
 	return (0);
 }
 
 int
 syscall32_helper_unregister(struct syscall_helper_data *sd)
 {
 	struct syscall_helper_data *sd1;
 
 	for (sd1 = sd; sd1->registered != 0; sd1++) {
 		syscall32_deregister(&sd1->syscall_no, &sd1->old_sysent);
 		sd1->registered = 0;
 	}
 	return (0);
 }
 
 register_t *
 freebsd32_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc, i;
 	u_int32_t *vectp;
 	char *stringp;
 	uintptr_t destp;
 	u_int32_t *stack_base;
 	struct freebsd32_ps_strings *arginfo;
 	char canary[sizeof(long) * 8];
 	int32_t pagesizes32[MAXPAGESIZES];
 	size_t execpath_len;
 	int szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	arginfo = (struct freebsd32_ps_strings *)curproc->p_sysent->
 	    sv_psstrings;
 	if (imgp->proc->p_sysent->sv_sigcode_base == 0)
 		szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
 	else
 		szsigcode = 0;
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(uint32_t));
 		copyout(imgp->proc->p_sysent->sv_sigcode, (void *)destp,
 		    szsigcode);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		destp -= execpath_len;
 		imgp->execpathp = destp;
 		copyout(imgp->execpath, (void *)destp, execpath_len);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = destp;
 	copyout(canary, (void *)destp, sizeof(canary));
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	for (i = 0; i < MAXPAGESIZES; i++)
 		pagesizes32[i] = (uint32_t)pagesizes[i];
 	destp -= sizeof(pagesizes32);
 	destp = rounddown2(destp, sizeof(uint32_t));
 	imgp->pagesizes = destp;
 	copyout(pagesizes32, (void *)destp, sizeof(pagesizes32));
 	imgp->pagesizeslen = sizeof(pagesizes32);
 
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(uint32_t));
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
 			: (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (u_int32_t *) (destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
 		    sizeof(u_int32_t));
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (u_int32_t *)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2) * sizeof(u_int32_t));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword32(vectp++, 0);
 
 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword32(vectp, 0);
 
 	return ((register_t *)stack_base);
 }
 
 int
 freebsd32_kldstat(struct thread *td, struct freebsd32_kldstat_args *uap)
 {
 	struct kld_file_stat stat;
 	struct kld32_file_stat stat32;
 	int error, version;
 
 	if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
 	    != 0)
 		return (error);
 	if (version != sizeof(struct kld32_file_stat_1) &&
 	    version != sizeof(struct kld32_file_stat))
 		return (EINVAL);
 
 	error = kern_kldstat(td, uap->fileid, &stat);
 	if (error != 0)
 		return (error);
 
 	bcopy(&stat.name[0], &stat32.name[0], sizeof(stat.name));
 	CP(stat, stat32, refs);
 	CP(stat, stat32, id);
 	PTROUT_CP(stat, stat32, address);
 	CP(stat, stat32, size);
 	bcopy(&stat.pathname[0], &stat32.pathname[0], sizeof(stat.pathname));
 	return (copyout(&stat32, uap->stat, version));
 }
 
 int
 freebsd32_posix_fallocate(struct thread *td,
     struct freebsd32_posix_fallocate_args *uap)
 {
 	int error;
 
 	error = kern_posix_fallocate(td, uap->fd,
 	    PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len));
 	return (kern_posix_error(td, error));
 }
 
 int
 freebsd32_posix_fadvise(struct thread *td,
     struct freebsd32_posix_fadvise_args *uap)
 {
 	int error;
 
 	error = kern_posix_fadvise(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    PAIR32TO64(off_t, uap->len), uap->advice);
 	return (kern_posix_error(td, error));
 }
 
 int
 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
 {
 
 	CP(*sig32, *sig, sigev_notify);
 	switch (sig->sigev_notify) {
 	case SIGEV_NONE:
 		break;
 	case SIGEV_THREAD_ID:
 		CP(*sig32, *sig, sigev_notify_thread_id);
 		/* FALLTHROUGH */
 	case SIGEV_SIGNAL:
 		CP(*sig32, *sig, sigev_signo);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		break;
 	case SIGEV_KEVENT:
 		CP(*sig32, *sig, sigev_notify_kqueue);
 		CP(*sig32, *sig, sigev_notify_kevent_flags);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 freebsd32_procctl(struct thread *td, struct freebsd32_procctl_args *uap)
 {
 	void *data;
 	union {
 		struct procctl_reaper_status rs;
 		struct procctl_reaper_pids rp;
 		struct procctl_reaper_kill rk;
 	} x;
 	union {
 		struct procctl_reaper_pids32 rp;
 	} x32;
 	int error, error1, flags;
 
 	switch (uap->com) {
 	case PROC_SPROTECT:
 	case PROC_TRACE_CTL:
 	case PROC_TRAPCAP_CTL:
 		error = copyin(PTRIN(uap->data), &flags, sizeof(flags));
 		if (error != 0)
 			return (error);
 		data = &flags;
 		break;
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 		if (uap->data != NULL)
 			return (EINVAL);
 		data = NULL;
 		break;
 	case PROC_REAP_STATUS:
 		data = &x.rs;
 		break;
 	case PROC_REAP_GETPIDS:
 		error = copyin(uap->data, &x32.rp, sizeof(x32.rp));
 		if (error != 0)
 			return (error);
 		CP(x32.rp, x.rp, rp_count);
 		PTRIN_CP(x32.rp, x.rp, rp_pids);
 		data = &x.rp;
 		break;
 	case PROC_REAP_KILL:
 		error = copyin(uap->data, &x.rk, sizeof(x.rk));
 		if (error != 0)
 			return (error);
 		data = &x.rk;
 		break;
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		data = &flags;
 		break;
 	default:
 		return (EINVAL);
 	}
 	error = kern_procctl(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    uap->com, data);
 	switch (uap->com) {
 	case PROC_REAP_STATUS:
 		if (error == 0)
 			error = copyout(&x.rs, uap->data, sizeof(x.rs));
 		break;
 	case PROC_REAP_KILL:
 		error1 = copyout(&x.rk, uap->data, sizeof(x.rk));
 		if (error == 0)
 			error = error1;
 		break;
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		if (error == 0)
 			error = copyout(&flags, uap->data, sizeof(flags));
 		break;
 	}
 	return (error);
 }
 
 int
 freebsd32_fcntl(struct thread *td, struct freebsd32_fcntl_args *uap)
 {
 	long tmp;
 
 	switch (uap->cmd) {
 	/*
 	 * Do unsigned conversion for arg when operation
 	 * interprets it as flags or pointer.
 	 */
 	case F_SETLK_REMOTE:
 	case F_SETLKW:
 	case F_SETLK:
 	case F_GETLK:
 	case F_SETFD:
 	case F_SETFL:
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		tmp = (unsigned int)(uap->arg);
 		break;
 	default:
 		tmp = uap->arg;
 		break;
 	}
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, tmp));
 }
 
 int
 freebsd32_ppoll(struct thread *td, struct freebsd32_ppoll_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	sigset_t set, *ssp;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 	if (uap->set != NULL) {
 		error = copyin(uap->set, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		ssp = &set;
 	} else
 		ssp = NULL;
 
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
 }
Index: stable/11/sys/compat/linux/linux_misc.c
===================================================================
--- stable/11/sys/compat/linux/linux_misc.c	(revision 314333)
+++ stable/11/sys/compat/linux/linux_misc.c	(revision 314334)
@@ -1,2524 +1,2517 @@
 /*-
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/blist.h>
 #include <sys/fcntl.h>
 #if defined(__i386__)
 #include <sys/imgact_aout.h>
 #endif
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/cpuset.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/swap_pager.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 #include <compat/linux/linux_sysproto.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_misc.h>
 
 /**
  * Special DTrace provider for the linuxulator.
  *
  * In this file we define the provider for the entire linuxulator. All
  * modules (= files of the linuxulator) use it.
  *
  * We define a different name depending on the emulated bitsize, see
  * ../../<ARCH>/linux{,32}/linux.h, e.g.:
  *      native bitsize          = linuxulator
  *      amd64, 32bit emulation  = linuxulator32
  */
 LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE);
 
 int stclohz;				/* Statistics clock frequency */
 
 static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
 	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
 	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
 	RLIMIT_MEMLOCK, RLIMIT_AS 
 };
 
 struct l_sysinfo {
 	l_long		uptime;		/* Seconds since boot */
 	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
 #define LINUX_SYSINFO_LOADS_SCALE 65536
 	l_ulong		totalram;	/* Total usable main memory size */
 	l_ulong		freeram;	/* Available memory size */
 	l_ulong		sharedram;	/* Amount of shared memory */
 	l_ulong		bufferram;	/* Memory used by buffers */
 	l_ulong		totalswap;	/* Total swap space size */
 	l_ulong		freeswap;	/* swap space still available */
 	l_ushort	procs;		/* Number of current processes */
 	l_ushort	pads;
 	l_ulong		totalbig;
 	l_ulong		freebig;
 	l_uint		mem_unit;
 	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
 };
 
 struct l_pselect6arg {
 	l_uintptr_t	ss;
 	l_size_t	ss_len;
 };
 
 static int	linux_utimensat_nsec_valid(l_long);
 
 
 int
 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
 {
 	struct l_sysinfo sysinfo;
 	vm_object_t object;
 	int i, j;
 	struct timespec ts;
 
 	bzero(&sysinfo, sizeof(sysinfo));
 	getnanouptime(&ts);
 	if (ts.tv_nsec != 0)
 		ts.tv_sec++;
 	sysinfo.uptime = ts.tv_sec;
 
 	/* Use the information from the mib to get our load averages */
 	for (i = 0; i < 3; i++)
 		sysinfo.loads[i] = averunnable.ldavg[i] *
 		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
 
 	sysinfo.totalram = physmem * PAGE_SIZE;
 	sysinfo.freeram = sysinfo.totalram - vm_cnt.v_wire_count * PAGE_SIZE;
 
 	sysinfo.sharedram = 0;
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list)
 		if (object->shadow_count > 1)
 			sysinfo.sharedram += object->resident_page_count;
 	mtx_unlock(&vm_object_list_mtx);
 
 	sysinfo.sharedram *= PAGE_SIZE;
 	sysinfo.bufferram = 0;
 
 	swap_pager_status(&i, &j);
 	sysinfo.totalswap = i * PAGE_SIZE;
 	sysinfo.freeswap = (i - j) * PAGE_SIZE;
 
 	sysinfo.procs = nprocs;
 
 	/* The following are only present in newer Linux kernels. */
 	sysinfo.totalbig = 0;
 	sysinfo.freebig = 0;
 	sysinfo.mem_unit = 1;
 
 	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
 }
 
 int
 linux_alarm(struct thread *td, struct linux_alarm_args *args)
 {
 	struct itimerval it, old_it;
 	u_int secs;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(alarm))
 		printf(ARGS(alarm, "%u"), args->secs);
 #endif
 	secs = args->secs;
 	/*
 	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
 	 * to match kern_setitimer()'s limit to avoid error from it.
 	 *
 	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
 	 * platforms.
 	 */
 	if (secs > INT32_MAX / 2)
 		secs = INT32_MAX / 2;
 
 	it.it_value.tv_sec = secs;
 	it.it_value.tv_usec = 0;
 	timevalclear(&it.it_interval);
 	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
 	KASSERT(error == 0, ("kern_setitimer returns %d", error));
 
 	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
 	    old_it.it_value.tv_usec >= 500000)
 		old_it.it_value.tv_sec++;
 	td->td_retval[0] = old_it.it_value.tv_sec;
 	return (0);
 }
 
 int
 linux_brk(struct thread *td, struct linux_brk_args *args)
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
 	vm_offset_t new, old;
 	struct obreak_args /* {
 		char * nsize;
 	} */ tmp;
 
 #ifdef DEBUG
 	if (ldebug(brk))
 		printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
 #endif
 	old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
 	new = (vm_offset_t)args->dsend;
 	tmp.nsize = (char *)new;
 	if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp))
 		td->td_retval[0] = (long)new;
 	else
 		td->td_retval[0] = (long)old;
 
 	return (0);
 }
 
 #if defined(__i386__)
 /* XXX: what about amd64/linux32? */
 
 int
 linux_uselib(struct thread *td, struct linux_uselib_args *args)
 {
 	struct nameidata ni;
 	struct vnode *vp;
 	struct exec *a_out;
 	struct vattr attr;
 	vm_offset_t vmaddr;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	char *library;
 	ssize_t aresid;
 	int error, locked, writecount;
 
 	LCONVPATHEXIST(td, args->library, &library);
 
 #ifdef DEBUG
 	if (ldebug(uselib))
 		printf(ARGS(uselib, "%s"), library);
 #endif
 
 	a_out = NULL;
 	locked = 0;
 	vp = NULL;
 
 	NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, library, td);
 	error = namei(&ni);
 	LFREEPATH(library);
 	if (error)
 		goto cleanup;
 
 	vp = ni.ni_vp;
 	NDFREE(&ni, NDF_ONLY_PNBUF);
 
 	/*
 	 * From here on down, we have a locked vnode that must be unlocked.
 	 * XXX: The code below largely duplicates exec_check_permissions().
 	 */
 	locked = 1;
 
 	/* Writable? */
 	error = VOP_GET_WRITECOUNT(vp, &writecount);
 	if (error != 0)
 		goto cleanup;
 	if (writecount != 0) {
 		error = ETXTBSY;
 		goto cleanup;
 	}
 
 	/* Executable? */
 	error = VOP_GETATTR(vp, &attr, td->td_ucred);
 	if (error)
 		goto cleanup;
 
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
 		/* EACCESS is what exec(2) returns. */
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Sensible size? */
 	if (attr.va_size == 0) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* Can we access it? */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		goto cleanup;
 
 	/*
 	 * XXX: This should use vn_open() so that it is properly authorized,
 	 * and to reduce code redundancy all over the place here.
 	 * XXX: Not really, it duplicates far more of exec_check_permissions()
 	 * than vn_open().
 	 */
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
 	if (error)
 		goto cleanup;
 #endif
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	if (error)
 		goto cleanup;
 
 	/* Pull in executable header into exec_map */
 	error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE,
 	    VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
 	if (error)
 		goto cleanup;
 
 	/* Is it a Linux binary ? */
 	if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/*
 	 * While we are here, we should REALLY do some more checks
 	 */
 
 	/* Set file/virtual offset based on a.out variant. */
 	switch ((int)(a_out->a_magic & 0xffff)) {
 	case 0413:			/* ZMAGIC */
 		file_offset = 1024;
 		break;
 	case 0314:			/* QMAGIC */
 		file_offset = 0;
 		break;
 	default:
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	bss_size = round_page(a_out->a_bss);
 
 	/* Check various fields in header for validity/bounds. */
 	if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
 		error = ENOEXEC;
 		goto cleanup;
 	}
 
 	/* text + data can't exceed file size */
 	if (a_out->a_data + a_out->a_text > attr.va_size) {
 		error = EFAULT;
 		goto cleanup;
 	}
 
 	/*
 	 * text/data/bss must not exceed limits
 	 * XXX - this is not complete. it should check current usage PLUS
 	 * the resources needed by this library.
 	 */
 	PROC_LOCK(td->td_proc);
 	if (a_out->a_text > maxtsiz ||
 	    a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) ||
 	    racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
 	    bss_size) != 0) {
 		PROC_UNLOCK(td->td_proc);
 		error = ENOMEM;
 		goto cleanup;
 	}
 	PROC_UNLOCK(td->td_proc);
 
 	/*
 	 * Prevent more writers.
 	 * XXX: Note that if any of the VM operations fail below we don't
 	 * clear this flag.
 	 */
 	VOP_SET_TEXT(vp);
 
 	/*
 	 * Lock no longer needed
 	 */
 	locked = 0;
 	VOP_UNLOCK(vp, 0);
 
 	/*
 	 * Check if file_offset page aligned. Currently we cannot handle
 	 * misalinged file offsets, and so we read in the entire image
 	 * (what a waste).
 	 */
 	if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 		printf("uselib: Non page aligned binary %lu\n", file_offset);
 #endif
 		/* Map text+data read/write/execute */
 
 		/* a_entry is the load address and is page aligned */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/* get anon user mapping, read+write+execute */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE,
 		    VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 
 		error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset,
 		    a_out->a_text + a_out->a_data, UIO_USERSPACE, 0,
 		    td->td_ucred, NOCRED, &aresid, td);
 		if (error != 0)
 			goto cleanup;
 		if (aresid != 0) {
 			error = ENOEXEC;
 			goto cleanup;
 		}
 	} else {
 #ifdef DEBUG
 		printf("uselib: Page aligned binary %lu\n", file_offset);
 #endif
 		/*
 		 * for QMAGIC, a_entry is 20 bytes beyond the load address
 		 * to skip the executable header
 		 */
 		vmaddr = trunc_page(a_out->a_entry);
 
 		/*
 		 * Map it all into the process's space as a single
 		 * copy-on-write "data" segment.
 		 */
 		error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
 		    a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
 		    MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
 		if (error)
 			goto cleanup;
 	}
 #ifdef DEBUG
 	printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
 	    ((long *)vmaddr)[1]);
 #endif
 	if (bss_size != 0) {
 		/* Calculate BSS start address */
 		vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
 		    a_out->a_data;
 
 		/* allocate some 'anon' space */
 		error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
 		    &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL,
 		    VM_PROT_ALL, 0);
 		if (error)
 			goto cleanup;
 	}
 
 cleanup:
 	/* Unlock vnode if needed */
 	if (locked)
 		VOP_UNLOCK(vp, 0);
 
 	/* Release the temporary mapping. */
 	if (a_out)
 		kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE);
 
 	return (error);
 }
 
 #endif	/* __i386__ */
 
 int
 linux_select(struct thread *td, struct linux_select_args *args)
 {
 	l_timeval ltv;
 	struct timeval tv0, tv1, utv, *tvp;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
 		    (void *)args->readfds, (void *)args->writefds,
 		    (void *)args->exceptfds, (void *)args->timeout);
 #endif
 
 	/*
 	 * Store current time for computation of the amount of
 	 * time left.
 	 */
 	if (args->timeout) {
 		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
 			goto select_out;
 		utv.tv_sec = ltv.tv_sec;
 		utv.tv_usec = ltv.tv_usec;
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("incoming timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 
 		if (itimerfix(&utv)) {
 			/*
 			 * The timeval was invalid.  Convert it to something
 			 * valid that will act as it does under Linux.
 			 */
 			utv.tv_sec += utv.tv_usec / 1000000;
 			utv.tv_usec %= 1000000;
 			if (utv.tv_usec < 0) {
 				utv.tv_sec -= 1;
 				utv.tv_usec += 1000000;
 			}
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		}
 		microtime(&tv0);
 		tvp = &utv;
 	} else
 		tvp = NULL;
 
 	error = kern_select(td, args->nfds, args->readfds, args->writefds,
 	    args->exceptfds, tvp, LINUX_NFDBITS);
 
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("real select returns %d"), error);
 #endif
 	if (error)
 		goto select_out;
 
 	if (args->timeout) {
 		if (td->td_retval[0]) {
 			/*
 			 * Compute how much time was left of the timeout,
 			 * by subtracting the current time and the time
 			 * before we started the call, and subtracting
 			 * that result from the user-supplied value.
 			 */
 			microtime(&tv1);
 			timevalsub(&tv1, &tv0);
 			timevalsub(&utv, &tv1);
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		} else
 			timevalclear(&utv);
 #ifdef DEBUG
 		if (ldebug(select))
 			printf(LMSG("outgoing timeout (%jd/%ld)"),
 			    (intmax_t)utv.tv_sec, utv.tv_usec);
 #endif
 		ltv.tv_sec = utv.tv_sec;
 		ltv.tv_usec = utv.tv_usec;
 		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
 			goto select_out;
 	}
 
 select_out:
 #ifdef DEBUG
 	if (ldebug(select))
 		printf(LMSG("select_out -> %d"), error);
 #endif
 	return (error);
 }
 
 int
 linux_mremap(struct thread *td, struct linux_mremap_args *args)
 {
-	struct munmap_args /* {
-		void *addr;
-		size_t len;
-	} */ bsd_args;
+	uintptr_t addr;
+	size_t len;
 	int error = 0;
 
 #ifdef DEBUG
 	if (ldebug(mremap))
 		printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
 		    (void *)(uintptr_t)args->addr,
 		    (unsigned long)args->old_len,
 		    (unsigned long)args->new_len,
 		    (unsigned long)args->flags);
 #endif
 
 	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	/*
 	 * Check for the page alignment.
 	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
 	 */
 	if (args->addr & PAGE_MASK) {
 		td->td_retval[0] = 0;
 		return (EINVAL);
 	}
 
 	args->new_len = round_page(args->new_len);
 	args->old_len = round_page(args->old_len);
 
 	if (args->new_len > args->old_len) {
 		td->td_retval[0] = 0;
 		return (ENOMEM);
 	}
 
 	if (args->new_len < args->old_len) {
-		bsd_args.addr =
-		    (caddr_t)((uintptr_t)args->addr + args->new_len);
-		bsd_args.len = args->old_len - args->new_len;
-		error = sys_munmap(td, &bsd_args);
+		addr = args->addr + args->new_len;
+		len = args->old_len - args->new_len;
+		error = kern_munmap(td, addr, len);
 	}
 
 	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
 	return (error);
 }
 
 #define LINUX_MS_ASYNC       0x0001
 #define LINUX_MS_INVALIDATE  0x0002
 #define LINUX_MS_SYNC        0x0004
 
 int
 linux_msync(struct thread *td, struct linux_msync_args *args)
 {
-	struct msync_args bsd_args;
 
-	bsd_args.addr = (caddr_t)(uintptr_t)args->addr;
-	bsd_args.len = (uintptr_t)args->len;
-	bsd_args.flags = args->fl & ~LINUX_MS_SYNC;
-
-	return (sys_msync(td, &bsd_args));
+	return (kern_msync(td, args->addr, args->len,
+	    args->fl & ~LINUX_MS_SYNC));
 }
 
 int
 linux_time(struct thread *td, struct linux_time_args *args)
 {
 	struct timeval tv;
 	l_time_t tm;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(time))
 		printf(ARGS(time, "*"));
 #endif
 
 	microtime(&tv);
 	tm = tv.tv_sec;
 	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
 		return (error);
 	td->td_retval[0] = tm;
 	return (0);
 }
 
 struct l_times_argv {
 	l_clock_t	tms_utime;
 	l_clock_t	tms_stime;
 	l_clock_t	tms_cutime;
 	l_clock_t	tms_cstime;
 };
 
 
 /*
  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
  * auxiliary vector entry.
  */
 #define	CLK_TCK		100
 
 #define	CONVOTCK(r)	(r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
 #define	CONVNTCK(r)	(r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
 
 #define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER_2004000 ?		\
 			    CONVNTCK(r) : CONVOTCK(r))
 
 int
 linux_times(struct thread *td, struct linux_times_args *args)
 {
 	struct timeval tv, utime, stime, cutime, cstime;
 	struct l_times_argv tms;
 	struct proc *p;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(times))
 		printf(ARGS(times, "*"));
 #endif
 
 	if (args->buf != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		PROC_STATLOCK(p);
 		calcru(p, &utime, &stime);
 		PROC_STATUNLOCK(p);
 		calccru(p, &cutime, &cstime);
 		PROC_UNLOCK(p);
 
 		tms.tms_utime = CONVTCK(utime);
 		tms.tms_stime = CONVTCK(stime);
 
 		tms.tms_cutime = CONVTCK(cutime);
 		tms.tms_cstime = CONVTCK(cstime);
 
 		if ((error = copyout(&tms, args->buf, sizeof(tms))))
 			return (error);
 	}
 
 	microuptime(&tv);
 	td->td_retval[0] = (int)CONVTCK(tv);
 	return (0);
 }
 
 int
 linux_newuname(struct thread *td, struct linux_newuname_args *args)
 {
 	struct l_new_utsname utsname;
 	char osname[LINUX_MAX_UTSNAME];
 	char osrelease[LINUX_MAX_UTSNAME];
 	char *p;
 
 #ifdef DEBUG
 	if (ldebug(newuname))
 		printf(ARGS(newuname, "*"));
 #endif
 
 	linux_get_osname(td, osname);
 	linux_get_osrelease(td, osrelease);
 
 	bzero(&utsname, sizeof(utsname));
 	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
 	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
 	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
 	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
 	for (p = utsname.version; *p != '\0'; ++p)
 		if (*p == '\n') {
 			*p = '\0';
 			break;
 		}
 	strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME);
 
 	return (copyout(&utsname, args->buf, sizeof(utsname)));
 }
 
 struct l_utimbuf {
 	l_time_t l_actime;
 	l_time_t l_modtime;
 };
 
 int
 linux_utime(struct thread *td, struct linux_utime_args *args)
 {
 	struct timeval tv[2], *tvp;
 	struct l_utimbuf lut;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utime))
 		printf(ARGS(utime, "%s, *"), fname);
 #endif
 
 	if (args->times) {
 		if ((error = copyin(args->times, &lut, sizeof lut))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = lut.l_actime;
 		tv[0].tv_usec = 0;
 		tv[1].tv_sec = lut.l_modtime;
 		tv[1].tv_usec = 0;
 		tvp = tv;
 	} else
 		tvp = NULL;
 
 	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp,
 	    UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 
 int
 linux_utimes(struct thread *td, struct linux_utimes_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error;
 
 	LCONVPATHEXIST(td, args->fname, &fname);
 
 #ifdef DEBUG
 	if (ldebug(utimes))
 		printf(ARGS(utimes, "%s, *"), fname);
 #endif
 
 	if (args->tptr != NULL) {
 		if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE,
 	    tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 
 static int
 linux_utimensat_nsec_valid(l_long nsec)
 {
 
 	if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW)
 		return (0);
 	if (nsec >= 0 && nsec <= 999999999)
 		return (0);
 	return (1);
 }
 
 int 
 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
 {
 	struct l_timespec l_times[2];
 	struct timespec times[2], *timesp = NULL;
 	char *path = NULL;
 	int error, dfd, flags = 0;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 
 #ifdef DEBUG
 	if (ldebug(utimensat))
 		printf(ARGS(utimensat, "%d, *"), dfd);
 #endif
 
 	if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	if (args->times != NULL) {
 		error = copyin(args->times, l_times, sizeof(l_times));
 		if (error != 0)
 			return (error);
 
 		if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 ||
 		    linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0)
 			return (EINVAL);
 
 		times[0].tv_sec = l_times[0].tv_sec;
 		switch (l_times[0].tv_nsec)
 		{
 		case LINUX_UTIME_OMIT:
 			times[0].tv_nsec = UTIME_OMIT;
 			break;
 		case LINUX_UTIME_NOW:
 			times[0].tv_nsec = UTIME_NOW;
 			break;
 		default:
 			times[0].tv_nsec = l_times[0].tv_nsec;
 		}
 
 		times[1].tv_sec = l_times[1].tv_sec;
 		switch (l_times[1].tv_nsec)
 		{
 		case LINUX_UTIME_OMIT:
 			times[1].tv_nsec = UTIME_OMIT;
 			break;
 		case LINUX_UTIME_NOW:
 			times[1].tv_nsec = UTIME_NOW;
 			break;
 		default:
 			times[1].tv_nsec = l_times[1].tv_nsec;
 			break;
 		}
 		timesp = times;
 
 		/* This breaks POSIX, but is what the Linux kernel does
 		 * _on purpose_ (documented in the man page for utimensat(2)),
 		 * so we must follow that behaviour. */
 		if (times[0].tv_nsec == UTIME_OMIT &&
 		    times[1].tv_nsec == UTIME_OMIT)
 			return (0);
 	}
 
 	if (args->pathname != NULL)
 		LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
 	else if (args->flags != 0)
 		return (EINVAL);
 
 	if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW)
 		flags |= AT_SYMLINK_NOFOLLOW;
 
 	if (path == NULL)
 		error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE);
 	else {
 		error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp,
 	    		UIO_SYSSPACE, flags);
 		LFREEPATH(path);
 	}
 
 	return (error);
 }
 
 int
 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
 {
 	l_timeval ltv[2];
 	struct timeval tv[2], *tvp = NULL;
 	char *fname;
 	int error, dfd;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 	LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
 
 #ifdef DEBUG
 	if (ldebug(futimesat))
 		printf(ARGS(futimesat, "%s, *"), fname);
 #endif
 
 	if (args->utimes != NULL) {
 		if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
 			LFREEPATH(fname);
 			return (error);
 		}
 		tv[0].tv_sec = ltv[0].tv_sec;
 		tv[0].tv_usec = ltv[0].tv_usec;
 		tv[1].tv_sec = ltv[1].tv_sec;
 		tv[1].tv_usec = ltv[1].tv_usec;
 		tvp = tv;
 	}
 
 	error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
 	LFREEPATH(fname);
 	return (error);
 }
 
 int
 linux_common_wait(struct thread *td, int pid, int *status,
     int options, struct rusage *ru)
 {
 	int error, tmpstat;
 
 	error = kern_wait(td, pid, &tmpstat, options, ru);
 	if (error)
 		return (error);
 
 	if (status) {
 		tmpstat &= 0xffff;
 		if (WIFSIGNALED(tmpstat))
 			tmpstat = (tmpstat & 0xffffff80) |
 			    bsd_to_linux_signal(WTERMSIG(tmpstat));
 		else if (WIFSTOPPED(tmpstat))
 			tmpstat = (tmpstat & 0xffff00ff) |
 			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
 		else if (WIFCONTINUED(tmpstat))
 			tmpstat = 0xffff;
 		error = copyout(&tmpstat, status, sizeof(int));
 	}
 
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
 {
 	struct linux_wait4_args wait4_args;
 
 #ifdef DEBUG
 	if (ldebug(waitpid))
 		printf(ARGS(waitpid, "%d, %p, %d"),
 		    args->pid, (void *)args->status, args->options);
 #endif
 
 	wait4_args.pid = args->pid;
 	wait4_args.status = args->status;
 	wait4_args.options = args->options;
 	wait4_args.rusage = NULL;
 
 	return (linux_wait4(td, &wait4_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_wait4(struct thread *td, struct linux_wait4_args *args)
 {
 	int error, options;
 	struct rusage ru, *rup;
 
 #ifdef DEBUG
 	if (ldebug(wait4))
 		printf(ARGS(wait4, "%d, %p, %d, %p"),
 		    args->pid, (void *)args->status, args->options,
 		    (void *)args->rusage);
 #endif
 	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
 	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
 		return (EINVAL);
 
 	options = WEXITED;
 	linux_to_bsd_waitopts(args->options, &options);
 
 	if (args->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = linux_common_wait(td, args->pid, args->status, options, rup);
 	if (error != 0)
 		return (error);
 	if (args->rusage != NULL)
 		error = linux_copyout_rusage(&ru, args->rusage);
 	return (error);
 }
 
 int
 linux_waitid(struct thread *td, struct linux_waitid_args *args)
 {
 	int status, options, sig;
 	struct __wrusage wru;
 	siginfo_t siginfo;
 	l_siginfo_t lsi;
 	idtype_t idtype;
 	struct proc *p;
 	int error;
 
 	options = 0;
 	linux_to_bsd_waitopts(args->options, &options);
 
 	if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED))
 		return (EINVAL);
 	if (!(options & (WEXITED | WUNTRACED | WCONTINUED)))
 		return (EINVAL);
 
 	switch (args->idtype) {
 	case LINUX_P_ALL:
 		idtype = P_ALL;
 		break;
 	case LINUX_P_PID:
 		if (args->id <= 0)
 			return (EINVAL);
 		idtype = P_PID;
 		break;
 	case LINUX_P_PGID:
 		if (args->id <= 0)
 			return (EINVAL);
 		idtype = P_PGID;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = kern_wait6(td, idtype, args->id, &status, options,
 	    &wru, &siginfo);
 	if (error != 0)
 		return (error);
 	if (args->rusage != NULL) {
 		error = linux_copyout_rusage(&wru.wru_children,
 		    args->rusage);
 		if (error != 0)
 			return (error);
 	}
 	if (args->info != NULL) {
 		p = td->td_proc;
 		if (td->td_retval[0] == 0)
 			bzero(&lsi, sizeof(lsi));
 		else {
 			sig = bsd_to_linux_signal(siginfo.si_signo);
 			siginfo_to_lsiginfo(&siginfo, &lsi, sig);
 		}
 		error = copyout(&lsi, args->info, sizeof(lsi));
 	}
 	td->td_retval[0] = 0;
 
 	return (error);
 }
 
 int
 linux_mknod(struct thread *td, struct linux_mknod_args *args)
 {
 	char *path;
 	int error;
 
 	LCONVPATHCREAT(td, args->path, &path);
 
 #ifdef DEBUG
 	if (ldebug(mknod))
 		printf(ARGS(mknod, "%s, %d, %ju"), path, args->mode,
 		    (uintmax_t)args->dev);
 #endif
 
 	switch (args->mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFSOCK:
 		error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    args->mode);
 		break;
 
 	case S_IFCHR:
 	case S_IFBLK:
 		error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    args->mode, args->dev);
 		break;
 
 	case S_IFDIR:
 		error = EPERM;
 		break;
 
 	case 0:
 		args->mode |= S_IFREG;
 		/* FALLTHROUGH */
 	case S_IFREG:
 		error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	LFREEPATH(path);
 	return (error);
 }
 
 int
 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
 {
 	char *path;
 	int error, dfd;
 
 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
 	LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
 
 #ifdef DEBUG
 	if (ldebug(mknodat))
 		printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
 #endif
 
 	switch (args->mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFSOCK:
 		error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
 		break;
 
 	case S_IFCHR:
 	case S_IFBLK:
 		error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
 		    args->dev);
 		break;
 
 	case S_IFDIR:
 		error = EPERM;
 		break;
 
 	case 0:
 		args->mode |= S_IFREG;
 		/* FALLTHROUGH */
 	case S_IFREG:
 		error = kern_openat(td, dfd, path, UIO_SYSSPACE,
 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
 		if (error == 0)
 			kern_close(td, td->td_retval[0]);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	LFREEPATH(path);
 	return (error);
 }
 
 /*
  * UGH! This is just about the dumbest idea I've ever heard!!
  */
 int
 linux_personality(struct thread *td, struct linux_personality_args *args)
 {
 	struct linux_pemuldata *pem;
 	struct proc *p = td->td_proc;
 	uint32_t old;
 
 #ifdef DEBUG
 	if (ldebug(personality))
 		printf(ARGS(personality, "%u"), args->per);
 #endif
 
 	PROC_LOCK(p);
 	pem = pem_find(p);
 	old = pem->persona;
 	if (args->per != 0xffffffff)
 		pem->persona = args->per;
 	PROC_UNLOCK(p);
 
 	td->td_retval[0] = old;
 	return (0);
 }
 
 struct l_itimerval {
 	l_timeval it_interval;
 	l_timeval it_value;
 };
 
 #define	B2L_ITIMERVAL(bip, lip) 					\
 	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
 	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
 	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
 	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;
 
 int
 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv, oitv;
 
 #ifdef DEBUG
 	if (ldebug(setitimer))
 		printf(ARGS(setitimer, "%p, %p"),
 		    (void *)uap->itv, (void *)uap->oitv);
 #endif
 
 	if (uap->itv == NULL) {
 		uap->itv = uap->oitv;
 		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
 	}
 
 	error = copyin(uap->itv, &ls, sizeof(ls));
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&aitv, &ls);
 #ifdef DEBUG
 	if (ldebug(setitimer)) {
 		printf("setitimer: value: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
 		printf("setitimer: interval: sec: %jd, usec: %ld\n",
 		    (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
 	}
 #endif
 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
 	if (error != 0 || uap->oitv == NULL)
 		return (error);
 	B2L_ITIMERVAL(&ls, &oitv);
 
 	return (copyout(&ls, uap->oitv, sizeof(ls)));
 }
 
 int
 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
 {
 	int error;
 	struct l_itimerval ls;
 	struct itimerval aitv;
 
 #ifdef DEBUG
 	if (ldebug(getitimer))
 		printf(ARGS(getitimer, "%p"), (void *)uap->itv);
 #endif
 	error = kern_getitimer(td, uap->which, &aitv);
 	if (error != 0)
 		return (error);
 	B2L_ITIMERVAL(&ls, &aitv);
 	return (copyout(&ls, uap->itv, sizeof(ls)));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_nice(struct thread *td, struct linux_nice_args *args)
 {
 	struct setpriority_args bsd_args;
 
 	bsd_args.which = PRIO_PROCESS;
 	bsd_args.who = 0;		/* current process */
 	bsd_args.prio = args->inc;
 	return (sys_setpriority(td, &bsd_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
 {
 	struct ucred *newcred, *oldcred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int ngrp, error;
 	struct proc *p;
 
 	ngrp = args->gidsetsize;
 	if (ngrp < 0 || ngrp >= ngroups_max + 1)
 		return (EINVAL);
 	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
 	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
 	if (error)
 		goto out;
 	newcred = crget();
 	crextend(newcred, ngrp + 1);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 	crcopy(newcred, oldcred);
 
 	/*
 	 * cr_groups[0] holds egid. Setting the whole set from
 	 * the supplied set will cause egid to be changed too.
 	 * Keep cr_groups[0] unchanged to prevent that.
 	 */
 
 	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 		goto out;
 	}
 
 	if (ngrp > 0) {
 		newcred->cr_ngroups = ngrp + 1;
 
 		bsd_gidset = newcred->cr_groups;
 		ngrp--;
 		while (ngrp >= 0) {
 			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
 			ngrp--;
 		}
 	} else
 		newcred->cr_ngroups = 1;
 
 	setsugid(p);
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	error = 0;
 out:
 	free(linux_gidset, M_LINUX);
 	return (error);
 }
 
 int
 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
 {
 	struct ucred *cred;
 	l_gid_t *linux_gidset;
 	gid_t *bsd_gidset;
 	int bsd_gidsetsz, ngrp, error;
 
 	cred = td->td_ucred;
 	bsd_gidset = cred->cr_groups;
 	bsd_gidsetsz = cred->cr_ngroups - 1;
 
 	/*
 	 * cr_groups[0] holds egid. Returning the whole set
 	 * here will cause a duplicate. Exclude cr_groups[0]
 	 * to prevent that.
 	 */
 
 	if ((ngrp = args->gidsetsize) == 0) {
 		td->td_retval[0] = bsd_gidsetsz;
 		return (0);
 	}
 
 	if (ngrp < bsd_gidsetsz)
 		return (EINVAL);
 
 	ngrp = 0;
 	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
 	    M_LINUX, M_WAITOK);
 	while (ngrp < bsd_gidsetsz) {
 		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
 		ngrp++;
 	}
 
 	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
 	free(linux_gidset, M_LINUX);
 	if (error)
 		return (error);
 
 	td->td_retval[0] = ngrp;
 	return (0);
 }
 
 int
 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
 {
 	struct rlimit bsd_rlim;
 	struct l_rlimit rlim;
 	u_int which;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(setrlimit))
 		printf(ARGS(setrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	error = copyin(args->rlim, &rlim, sizeof(rlim));
 	if (error)
 		return (error);
 
 	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
 	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
 	return (kern_setrlimit(td, which, &bsd_rlim));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
 {
 	struct l_rlimit rlim;
 	struct rlimit bsd_rlim;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(old_getrlimit))
 		printf(ARGS(old_getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	lim_rlimit(td, which, &bsd_rlim);
 
 #ifdef COMPAT_LINUX32
 	rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
 	if (rlim.rlim_cur == UINT_MAX)
 		rlim.rlim_cur = INT_MAX;
 	rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
 	if (rlim.rlim_max == UINT_MAX)
 		rlim.rlim_max = INT_MAX;
 #else
 	rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
 	if (rlim.rlim_cur == ULONG_MAX)
 		rlim.rlim_cur = LONG_MAX;
 	rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
 	if (rlim.rlim_max == ULONG_MAX)
 		rlim.rlim_max = LONG_MAX;
 #endif
 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
 {
 	struct l_rlimit rlim;
 	struct rlimit bsd_rlim;
 	u_int which;
 
 #ifdef DEBUG
 	if (ldebug(getrlimit))
 		printf(ARGS(getrlimit, "%d, %p"),
 		    args->resource, (void *)args->rlim);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	lim_rlimit(td, which, &bsd_rlim);
 
 	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
 	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
 }
 
 int
 linux_sched_setscheduler(struct thread *td,
     struct linux_sched_setscheduler_args *args)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error, policy;
 
 #ifdef DEBUG
 	if (ldebug(sched_setscheduler))
 		printf(ARGS(sched_setscheduler, "%d, %d, %p"),
 		    args->pid, args->policy, (const void *)args->param);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = copyin(args->param, &sched_param, sizeof(sched_param));
 	if (error)
 		return (error);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	return (error);
 }
 
 int
 linux_sched_getscheduler(struct thread *td,
     struct linux_sched_getscheduler_args *args)
 {
 	struct thread *tdt;
 	int error, policy;
 
 #ifdef DEBUG
 	if (ldebug(sched_getscheduler))
 		printf(ARGS(sched_getscheduler, "%d"), args->pid);
 #endif
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_getscheduler(td, tdt, &policy);
 	PROC_UNLOCK(tdt->td_proc);
 
 	switch (policy) {
 	case SCHED_OTHER:
 		td->td_retval[0] = LINUX_SCHED_OTHER;
 		break;
 	case SCHED_FIFO:
 		td->td_retval[0] = LINUX_SCHED_FIFO;
 		break;
 	case SCHED_RR:
 		td->td_retval[0] = LINUX_SCHED_RR;
 		break;
 	}
 	return (error);
 }
 
 int
 linux_sched_get_priority_max(struct thread *td,
     struct linux_sched_get_priority_max_args *args)
 {
 	struct sched_get_priority_max_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_max))
 		printf(ARGS(sched_get_priority_max, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_max(td, &bsd));
 }
 
 int
 linux_sched_get_priority_min(struct thread *td,
     struct linux_sched_get_priority_min_args *args)
 {
 	struct sched_get_priority_min_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(sched_get_priority_min))
 		printf(ARGS(sched_get_priority_min, "%d"), args->policy);
 #endif
 
 	switch (args->policy) {
 	case LINUX_SCHED_OTHER:
 		bsd.policy = SCHED_OTHER;
 		break;
 	case LINUX_SCHED_FIFO:
 		bsd.policy = SCHED_FIFO;
 		break;
 	case LINUX_SCHED_RR:
 		bsd.policy = SCHED_RR;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_sched_get_priority_min(td, &bsd));
 }
 
 #define REBOOT_CAD_ON	0x89abcdef
 #define REBOOT_CAD_OFF	0
 #define REBOOT_HALT	0xcdef0123
 #define REBOOT_RESTART	0x01234567
 #define REBOOT_RESTART2	0xA1B2C3D4
 #define REBOOT_POWEROFF	0x4321FEDC
 #define REBOOT_MAGIC1	0xfee1dead
 #define REBOOT_MAGIC2	0x28121969
 #define REBOOT_MAGIC2A	0x05121996
 #define REBOOT_MAGIC2B	0x16041998
 
 int
 linux_reboot(struct thread *td, struct linux_reboot_args *args)
 {
 	struct reboot_args bsd_args;
 
 #ifdef DEBUG
 	if (ldebug(reboot))
 		printf(ARGS(reboot, "0x%x"), args->cmd);
 #endif
 
 	if (args->magic1 != REBOOT_MAGIC1)
 		return (EINVAL);
 
 	switch (args->magic2) {
 	case REBOOT_MAGIC2:
 	case REBOOT_MAGIC2A:
 	case REBOOT_MAGIC2B:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	switch (args->cmd) {
 	case REBOOT_CAD_ON:
 	case REBOOT_CAD_OFF:
 		return (priv_check(td, PRIV_REBOOT));
 	case REBOOT_HALT:
 		bsd_args.opt = RB_HALT;
 		break;
 	case REBOOT_RESTART:
 	case REBOOT_RESTART2:
 		bsd_args.opt = 0;
 		break;
 	case REBOOT_POWEROFF:
 		bsd_args.opt = RB_POWEROFF;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (sys_reboot(td, &bsd_args));
 }
 
 
 /*
  * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
  * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
  * are assumed to be preserved. The following lightweight syscalls fixes
  * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c
  *
  * linux_getpid() - MP SAFE
  * linux_getgid() - MP SAFE
  * linux_getuid() - MP SAFE
  */
 
 int
 linux_getpid(struct thread *td, struct linux_getpid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getpid))
 		printf(ARGS(getpid, ""));
 #endif
 	td->td_retval[0] = td->td_proc->p_pid;
 
 	return (0);
 }
 
 int
 linux_gettid(struct thread *td, struct linux_gettid_args *args)
 {
 	struct linux_emuldata *em;
 
 #ifdef DEBUG
 	if (ldebug(gettid))
 		printf(ARGS(gettid, ""));
 #endif
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
 
 	td->td_retval[0] = em->em_tid;
 
 	return (0);
 }
 
 
 int
 linux_getppid(struct thread *td, struct linux_getppid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getppid))
 		printf(ARGS(getppid, ""));
 #endif
 
 	PROC_LOCK(td->td_proc);
 	td->td_retval[0] = td->td_proc->p_pptr->p_pid;
 	PROC_UNLOCK(td->td_proc);
 	return (0);
 }
 
 int
 linux_getgid(struct thread *td, struct linux_getgid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getgid))
 		printf(ARGS(getgid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 	return (0);
 }
 
 int
 linux_getuid(struct thread *td, struct linux_getuid_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(getuid))
 		printf(ARGS(getuid, ""));
 #endif
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 	return (0);
 }
 
 
 int
 linux_getsid(struct thread *td, struct linux_getsid_args *args)
 {
 	struct getsid_args bsd;
 
 #ifdef DEBUG
 	if (ldebug(getsid))
 		printf(ARGS(getsid, "%i"), args->pid);
 #endif
 
 	bsd.pid = args->pid;
 	return (sys_getsid(td, &bsd));
 }
 
 int
 linux_nosys(struct thread *td, struct nosys_args *ignore)
 {
 
 	return (ENOSYS);
 }
 
 int
 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
 {
 	struct getpriority_args bsd_args;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(getpriority))
 		printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
 #endif
 
 	bsd_args.which = args->which;
 	bsd_args.who = args->who;
 	error = sys_getpriority(td, &bsd_args);
 	td->td_retval[0] = 20 - td->td_retval[0];
 	return (error);
 }
 
 int
 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
 {
 	int name[2];
 
 #ifdef DEBUG
 	if (ldebug(sethostname))
 		printf(ARGS(sethostname, "*, %i"), args->len);
 #endif
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_HOSTNAME;
 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
 	    args->len, 0, 0));
 }
 
 int
 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
 {
 	int name[2];
 
 #ifdef DEBUG
 	if (ldebug(setdomainname))
 		printf(ARGS(setdomainname, "*, %i"), args->len);
 #endif
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_NISDOMAINNAME;
 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
 	    args->len, 0, 0));
 }
 
 int
 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
 {
 
 #ifdef DEBUG
 	if (ldebug(exit_group))
 		printf(ARGS(exit_group, "%i"), args->error_code);
 #endif
 
 	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
 	    args->error_code);
 
 	/*
 	 * XXX: we should send a signal to the parent if
 	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
 	 * as it doesnt occur often.
 	 */
 	exit1(td, args->error_code, 0);
 		/* NOTREACHED */
 }
 
 #define _LINUX_CAPABILITY_VERSION  0x19980330
 
 struct l_user_cap_header {
 	l_int	version;
 	l_int	pid;
 };
 
 struct l_user_cap_data {
 	l_int	effective;
 	l_int	permitted;
 	l_int	inheritable;
 };
 
 int
 linux_capget(struct thread *td, struct linux_capget_args *args)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd;
 	int error;
 
 	if (args->hdrp == NULL)
 		return (EFAULT);
 
 	error = copyin(args->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	if (luch.version != _LINUX_CAPABILITY_VERSION) {
 		luch.version = _LINUX_CAPABILITY_VERSION;
 		error = copyout(&luch, args->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	if (args->datap) {
 		/*
 		 * The current implementation doesn't support setting
 		 * a capability (it's essentially a stub) so indicate
 		 * that no capabilities are currently set or available
 		 * to request.
 		 */
 		bzero (&lucd, sizeof(lucd));
 		error = copyout(&lucd, args->datap, sizeof(lucd));
 	}
 
 	return (error);
 }
 
 int
 linux_capset(struct thread *td, struct linux_capset_args *args)
 {
 	struct l_user_cap_header luch;
 	struct l_user_cap_data lucd;
 	int error;
 
 	if (args->hdrp == NULL || args->datap == NULL)
 		return (EFAULT);
 
 	error = copyin(args->hdrp, &luch, sizeof(luch));
 	if (error != 0)
 		return (error);
 
 	if (luch.version != _LINUX_CAPABILITY_VERSION) {
 		luch.version = _LINUX_CAPABILITY_VERSION;
 		error = copyout(&luch, args->hdrp, sizeof(luch));
 		if (error)
 			return (error);
 		return (EINVAL);
 	}
 
 	if (luch.pid)
 		return (EPERM);
 
 	error = copyin(args->datap, &lucd, sizeof(lucd));
 	if (error != 0)
 		return (error);
 
 	/* We currently don't support setting any capabilities. */
 	if (lucd.effective || lucd.permitted || lucd.inheritable) {
 		linux_msg(td,
 			  "capset effective=0x%x, permitted=0x%x, "
 			  "inheritable=0x%x is not implemented",
 			  (int)lucd.effective, (int)lucd.permitted,
 			  (int)lucd.inheritable);
 		return (EPERM);
 	}
 
 	return (0);
 }
 
 int
 linux_prctl(struct thread *td, struct linux_prctl_args *args)
 {
 	int error = 0, max_size;
 	struct proc *p = td->td_proc;
 	char comm[LINUX_MAX_COMM_LEN];
 	struct linux_emuldata *em;
 	int pdeath_signal;
 
 #ifdef DEBUG
 	if (ldebug(prctl))
 		printf(ARGS(prctl, "%d, %ju, %ju, %ju, %ju"), args->option,
 		    (uintmax_t)args->arg2, (uintmax_t)args->arg3,
 		    (uintmax_t)args->arg4, (uintmax_t)args->arg5);
 #endif
 
 	switch (args->option) {
 	case LINUX_PR_SET_PDEATHSIG:
 		if (!LINUX_SIG_VALID(args->arg2))
 			return (EINVAL);
 		em = em_find(td);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		em->pdeath_signal = args->arg2;
 		break;
 	case LINUX_PR_GET_PDEATHSIG:
 		em = em_find(td);
 		KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
 		pdeath_signal = em->pdeath_signal;
 		error = copyout(&pdeath_signal,
 		    (void *)(register_t)args->arg2,
 		    sizeof(pdeath_signal));
 		break;
 	case LINUX_PR_GET_KEEPCAPS:
 		/*
 		 * Indicate that we always clear the effective and
 		 * permitted capability sets when the user id becomes
 		 * non-zero (actually the capability sets are simply
 		 * always zero in the current implementation).
 		 */
 		td->td_retval[0] = 0;
 		break;
 	case LINUX_PR_SET_KEEPCAPS:
 		/*
 		 * Ignore requests to keep the effective and permitted
 		 * capability sets when the user id becomes non-zero.
 		 */
 		break;
 	case LINUX_PR_SET_NAME:
 		/*
 		 * To be on the safe side we need to make sure to not
 		 * overflow the size a linux program expects. We already
 		 * do this here in the copyin, so that we don't need to
 		 * check on copyout.
 		 */
 		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
 		error = copyinstr((void *)(register_t)args->arg2, comm,
 		    max_size, NULL);
 
 		/* Linux silently truncates the name if it is too long. */
 		if (error == ENAMETOOLONG) {
 			/*
 			 * XXX: copyinstr() isn't documented to populate the
 			 * array completely, so do a copyin() to be on the
 			 * safe side. This should be changed in case
 			 * copyinstr() is changed to guarantee this.
 			 */
 			error = copyin((void *)(register_t)args->arg2, comm,
 			    max_size - 1);
 			comm[max_size - 1] = '\0';
 		}
 		if (error)
 			return (error);
 
 		PROC_LOCK(p);
 		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
 		PROC_UNLOCK(p);
 		break;
 	case LINUX_PR_GET_NAME:
 		PROC_LOCK(p);
 		strlcpy(comm, p->p_comm, sizeof(comm));
 		PROC_UNLOCK(p);
 		error = copyout(comm, (void *)(register_t)args->arg2,
 		    strlen(comm) + 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 int
 linux_sched_setparam(struct thread *td,
     struct linux_sched_setparam_args *uap)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(sched_setparam))
 		printf(ARGS(sched_setparam, "%d, *"), uap->pid);
 #endif
 
 	error = copyin(uap->param, &sched_param, sizeof(sched_param));
 	if (error)
 		return (error);
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_setparam(td, tdt, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	return (error);
 }
 
 int
 linux_sched_getparam(struct thread *td,
     struct linux_sched_getparam_args *uap)
 {
 	struct sched_param sched_param;
 	struct thread *tdt;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(sched_getparam))
 		printf(ARGS(sched_getparam, "%d, *"), uap->pid);
 #endif
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_getparam(td, tdt, &sched_param);
 	PROC_UNLOCK(tdt->td_proc);
 	if (error == 0)
 		error = copyout(&sched_param, uap->param,
 		    sizeof(sched_param));
 	return (error);
 }
 
 /*
  * Get affinity of a process.
  */
 int
 linux_sched_getaffinity(struct thread *td,
     struct linux_sched_getaffinity_args *args)
 {
 	int error;
 	struct thread *tdt;
 	struct cpuset_getaffinity_args cga;
 
 #ifdef DEBUG
 	if (ldebug(sched_getaffinity))
 		printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	PROC_UNLOCK(tdt->td_proc);
 	cga.level = CPU_LEVEL_WHICH;
 	cga.which = CPU_WHICH_TID;
 	cga.id = tdt->td_tid;
 	cga.cpusetsize = sizeof(cpuset_t);
 	cga.mask = (cpuset_t *) args->user_mask_ptr;
 
 	if ((error = sys_cpuset_getaffinity(td, &cga)) == 0)
 		td->td_retval[0] = sizeof(cpuset_t);
 
 	return (error);
 }
 
 /*
  *  Set affinity of a process.
  */
 int
 linux_sched_setaffinity(struct thread *td,
     struct linux_sched_setaffinity_args *args)
 {
 	struct cpuset_setaffinity_args csa;
 	struct thread *tdt;
 
 #ifdef DEBUG
 	if (ldebug(sched_setaffinity))
 		printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
 		    args->len);
 #endif
 	if (args->len < sizeof(cpuset_t))
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, args->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	PROC_UNLOCK(tdt->td_proc);
 	csa.level = CPU_LEVEL_WHICH;
 	csa.which = CPU_WHICH_TID;
 	csa.id = tdt->td_tid;
 	csa.cpusetsize = sizeof(cpuset_t);
 	csa.mask = (cpuset_t *) args->user_mask_ptr;
 
 	return (sys_cpuset_setaffinity(td, &csa));
 }
 
 struct linux_rlimit64 {
 	uint64_t	rlim_cur;
 	uint64_t	rlim_max;
 };
 
 int
 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
 {
 	struct rlimit rlim, nrlim;
 	struct linux_rlimit64 lrlim;
 	struct proc *p;
 	u_int which;
 	int flags;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(prlimit64))
 		printf(ARGS(prlimit64, "%d, %d, %p, %p"), args->pid,
 		    args->resource, (void *)args->new, (void *)args->old);
 #endif
 
 	if (args->resource >= LINUX_RLIM_NLIMITS)
 		return (EINVAL);
 
 	which = linux_to_bsd_resource[args->resource];
 	if (which == -1)
 		return (EINVAL);
 
 	if (args->new != NULL) {
 		/*
 		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
 		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
 		 * as INFINITY so we do not need a conversion even.
 		 */
 		error = copyin(args->new, &nrlim, sizeof(nrlim));
 		if (error != 0)
 			return (error);
 	}
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (args->new != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget(args->pid, flags, &p);
 	if (error != 0)
 		return (error);
 
 	if (args->old != NULL) {
 		PROC_LOCK(p);
 		lim_rlimit_proc(p, which, &rlim);
 		PROC_UNLOCK(p);
 		if (rlim.rlim_cur == RLIM_INFINITY)
 			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
 		else
 			lrlim.rlim_cur = rlim.rlim_cur;
 		if (rlim.rlim_max == RLIM_INFINITY)
 			lrlim.rlim_max = LINUX_RLIM_INFINITY;
 		else
 			lrlim.rlim_max = rlim.rlim_max;
 		error = copyout(&lrlim, args->old, sizeof(lrlim));
 		if (error != 0)
 			goto out;
 	}
 
 	if (args->new != NULL)
 		error = kern_proc_setrlimit(td, p, which, &nrlim);
 
  out:
 	PRELE(p);
 	return (error);
 }
 
 int
 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
 {
 	struct timeval utv, tv0, tv1, *tvp;
 	struct l_pselect6arg lpse6;
 	struct l_timespec lts;
 	struct timespec uts;
 	l_sigset_t l_ss;
 	sigset_t *ssp;
 	sigset_t ss;
 	int error;
 
 	ssp = NULL;
 	if (args->sig != NULL) {
 		error = copyin(args->sig, &lpse6, sizeof(lpse6));
 		if (error != 0)
 			return (error);
 		if (lpse6.ss_len != sizeof(l_ss))
 			return (EINVAL);
 		if (lpse6.ss != 0) {
 			error = copyin(PTRIN(lpse6.ss), &l_ss,
 			    sizeof(l_ss));
 			if (error != 0)
 				return (error);
 			linux_to_bsd_sigset(&l_ss, &ss);
 			ssp = &ss;
 		}
 	}
 
 	/*
 	 * Currently glibc changes nanosecond number to microsecond.
 	 * This mean losing precision but for now it is hardly seen.
 	 */
 	if (args->tsp != NULL) {
 		error = copyin(args->tsp, &lts, sizeof(lts));
 		if (error != 0)
 			return (error);
 		error = linux_to_native_timespec(&uts, &lts);
 		if (error != 0)
 			return (error);
 
 		TIMESPEC_TO_TIMEVAL(&utv, &uts);
 		if (itimerfix(&utv))
 			return (EINVAL);
 
 		microtime(&tv0);
 		tvp = &utv;
 	} else
 		tvp = NULL;
 
 	error = kern_pselect(td, args->nfds, args->readfds, args->writefds,
 	    args->exceptfds, tvp, ssp, LINUX_NFDBITS);
 
 	if (error == 0 && args->tsp != NULL) {
 		if (td->td_retval[0] != 0) {
 			/*
 			 * Compute how much time was left of the timeout,
 			 * by subtracting the current time and the time
 			 * before we started the call, and subtracting
 			 * that result from the user-supplied value.
 			 */
 
 			microtime(&tv1);
 			timevalsub(&tv1, &tv0);
 			timevalsub(&utv, &tv1);
 			if (utv.tv_sec < 0)
 				timevalclear(&utv);
 		} else
 			timevalclear(&utv);
 
 		TIMEVAL_TO_TIMESPEC(&utv, &uts);
 
 		native_to_linux_timespec(&lts, &uts);
 		error = copyout(&lts, args->tsp, sizeof(lts));
 	}
 
 	return (error);
 }
 
 int
 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
 {
 	struct timespec ts0, ts1;
 	struct l_timespec lts;
 	struct timespec uts, *tsp;
 	l_sigset_t l_ss;
 	sigset_t *ssp;
 	sigset_t ss;
 	int error;
 
 	if (args->sset != NULL) {
 		if (args->ssize != sizeof(l_ss))
 			return (EINVAL);
 		error = copyin(args->sset, &l_ss, sizeof(l_ss));
 		if (error)
 			return (error);
 		linux_to_bsd_sigset(&l_ss, &ss);
 		ssp = &ss;
 	} else
 		ssp = NULL;
 	if (args->tsp != NULL) {
 		error = copyin(args->tsp, &lts, sizeof(lts));
 		if (error)
 			return (error);
 		error = linux_to_native_timespec(&uts, &lts);
 		if (error != 0)
 			return (error);
 
 		nanotime(&ts0);
 		tsp = &uts;
 	} else
 		tsp = NULL;
 
 	error = kern_poll(td, args->fds, args->nfds, tsp, ssp);
 
 	if (error == 0 && args->tsp != NULL) {
 		if (td->td_retval[0]) {
 			nanotime(&ts1);
 			timespecsub(&ts1, &ts0);
 			timespecsub(&uts, &ts1);
 			if (uts.tv_sec < 0)
 				timespecclear(&uts);
 		} else
 			timespecclear(&uts);
 
 		native_to_linux_timespec(&lts, &uts);
 		error = copyout(&lts, args->tsp, sizeof(lts));
 	}
 
 	return (error);
 }
 
 #if defined(DEBUG) || defined(KTR)
 /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */
 
 #ifdef COMPAT_LINUX32
 #define	L_MAXSYSCALL	LINUX32_SYS_MAXSYSCALL
 #else
 #define	L_MAXSYSCALL	LINUX_SYS_MAXSYSCALL
 #endif
 
 u_char linux_debug_map[howmany(L_MAXSYSCALL, sizeof(u_char))];
 
 static int
 linux_debug(int syscall, int toggle, int global)
 {
 
 	if (global) {
 		char c = toggle ? 0 : 0xff;
 
 		memset(linux_debug_map, c, sizeof(linux_debug_map));
 		return (0);
 	}
 	if (syscall < 0 || syscall >= L_MAXSYSCALL)
 		return (EINVAL);
 	if (toggle)
 		clrbit(linux_debug_map, syscall);
 	else
 		setbit(linux_debug_map, syscall);
 	return (0);
 }
 #undef L_MAXSYSCALL
 
 /*
  * Usage: sysctl linux.debug=<syscall_nr>.<0/1>
  *
  *    E.g.: sysctl linux.debug=21.0
  *
  * As a special case, syscall "all" will apply to all syscalls globally.
  */
 #define LINUX_MAX_DEBUGSTR	16
 int
 linux_sysctl_debug(SYSCTL_HANDLER_ARGS)
 {
 	char value[LINUX_MAX_DEBUGSTR], *p;
 	int error, sysc, toggle;
 	int global = 0;
 
 	value[0] = '\0';
 	error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req);
 	if (error || req->newptr == NULL)
 		return (error);
 	for (p = value; *p != '\0' && *p != '.'; p++);
 	if (*p == '\0')
 		return (EINVAL);
 	*p++ = '\0';
 	sysc = strtol(value, NULL, 0);
 	toggle = strtol(p, NULL, 0);
 	if (strcmp(value, "all") == 0)
 		global = 1;
 	error = linux_debug(sysc, toggle, global);
 	return (error);
 }
 
 #endif /* DEBUG || KTR */
 
 int
 linux_sched_rr_get_interval(struct thread *td,
     struct linux_sched_rr_get_interval_args *uap)
 {
 	struct timespec ts;
 	struct l_timespec lts;
 	struct thread *tdt;
 	int error;
 
 	/*
 	 * According to man in case the invalid pid specified
 	 * EINVAL should be returned.
 	 */
 	if (uap->pid < 0)
 		return (EINVAL);
 
 	tdt = linux_tdfind(td, uap->pid, -1);
 	if (tdt == NULL)
 		return (ESRCH);
 
 	error = kern_sched_rr_get_interval_td(td, tdt, &ts);
 	PROC_UNLOCK(tdt->td_proc);
 	if (error != 0)
 		return (error);
 	native_to_linux_timespec(&lts, &ts);
 	return (copyout(&lts, uap->interval, sizeof(lts)));
 }
 
 /*
  * In case when the Linux thread is the initial thread in
  * the thread group thread id is equal to the process id.
  * Glibc depends on this magic (assert in pthread_getattr_np.c).
  */
 struct thread *
 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
 {
 	struct linux_emuldata *em;
 	struct thread *tdt;
 	struct proc *p;
 
 	tdt = NULL;
 	if (tid == 0 || tid == td->td_tid) {
 		tdt = td;
 		PROC_LOCK(tdt->td_proc);
 	} else if (tid > PID_MAX)
 		tdt = tdfind(tid, pid);
 	else {
 		/*
 		 * Initial thread where the tid equal to the pid.
 		 */
 		p = pfind(tid);
 		if (p != NULL) {
 			if (SV_PROC_ABI(p) != SV_ABI_LINUX) {
 				/*
 				 * p is not a Linuxulator process.
 				 */
 				PROC_UNLOCK(p);
 				return (NULL);
 			}
 			FOREACH_THREAD_IN_PROC(p, tdt) {
 				em = em_find(tdt);
 				if (tid == em->em_tid)
 					return (tdt);
 			}
 			PROC_UNLOCK(p);
 		}
 		return (NULL);
 	}
 
 	return (tdt);
 }
 
 void
 linux_to_bsd_waitopts(int options, int *bsdopts)
 {
 
 	if (options & LINUX_WNOHANG)
 		*bsdopts |= WNOHANG;
 	if (options & LINUX_WUNTRACED)
 		*bsdopts |= WUNTRACED;
 	if (options & LINUX_WEXITED)
 		*bsdopts |= WEXITED;
 	if (options & LINUX_WCONTINUED)
 		*bsdopts |= WCONTINUED;
 	if (options & LINUX_WNOWAIT)
 		*bsdopts |= WNOWAIT;
 
 	if (options & __WCLONE)
 		*bsdopts |= WLINUXCLONE;
 }
Index: stable/11/sys/compat/linux/linux_mmap.c
===================================================================
--- stable/11/sys/compat/linux/linux_mmap.c	(revision 314333)
+++ stable/11/sys/compat/linux/linux_mmap.c	(revision 314334)
@@ -1,257 +1,236 @@
 /*-
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2000 Marcel Moolenaar
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/capsicum.h>
 #include <sys/file.h>
 #include <sys/imgact.h>
 #include <sys/ktr.h>
 #include <sys/mman.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 
 #include <vm/pmap.h>
+#include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_mmap.h>
 #include <compat/linux/linux_persona.h>
 #include <compat/linux/linux_util.h>
 
 
 #define STACK_SIZE  (2 * 1024 * 1024)
 #define GUARD_SIZE  (4 * PAGE_SIZE)
 
 #if defined(__amd64__)
 static void linux_fixup_prot(struct thread *td, int *prot);
 #endif
 
 
 int
 linux_mmap_common(struct thread *td, uintptr_t addr, size_t len, int prot,
     int flags, int fd, off_t pos)
 {
 	struct proc *p = td->td_proc;
 	struct vmspace *vms = td->td_proc->p_vmspace;
-	struct mmap_args /* {
-		caddr_t addr;
-		size_t len;
-		int prot;
-		int flags;
-		int fd;
-		off_t pos;
-	} */ bsd_args;
-	int error;
+	int bsd_flags, error;
 	struct file *fp;
 
 	cap_rights_t rights;
 	LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx",
 	    addr, len, prot, flags, fd, pos);
 
 	error = 0;
-	bsd_args.flags = 0;
+	bsd_flags = 0;
 	fp = NULL;
 
 	/*
 	 * Linux mmap(2):
 	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
 	 */
 	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
 		return (EINVAL);
 
 	if (flags & LINUX_MAP_SHARED)
-		bsd_args.flags |= MAP_SHARED;
+		bsd_flags |= MAP_SHARED;
 	if (flags & LINUX_MAP_PRIVATE)
-		bsd_args.flags |= MAP_PRIVATE;
+		bsd_flags |= MAP_PRIVATE;
 	if (flags & LINUX_MAP_FIXED)
-		bsd_args.flags |= MAP_FIXED;
+		bsd_flags |= MAP_FIXED;
 	if (flags & LINUX_MAP_ANON) {
 		/* Enforce pos to be on page boundary, then ignore. */
 		if ((pos & PAGE_MASK) != 0)
 			return (EINVAL);
 		pos = 0;
-		bsd_args.flags |= MAP_ANON;
+		bsd_flags |= MAP_ANON;
 	} else
-		bsd_args.flags |= MAP_NOSYNC;
+		bsd_flags |= MAP_NOSYNC;
 	if (flags & LINUX_MAP_GROWSDOWN)
-		bsd_args.flags |= MAP_STACK;
+		bsd_flags |= MAP_STACK;
 
 	/*
 	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
 	 * on Linux/i386 if the binary requires executable stack.
 	 * We do this only for IA32 emulation as on native i386 this is does not
 	 * make sense without PAE.
 	 *
 	 * XXX. Linux checks that the file system is not mounted with noexec.
 	 */
-	bsd_args.prot = prot;
 #if defined(__amd64__)
-	linux_fixup_prot(td, &bsd_args.prot);
+	linux_fixup_prot(td, &prot);
 #endif
 
 	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
-	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
-	if (bsd_args.fd != -1) {
+	fd = (bsd_flags & MAP_ANON) ? -1 : fd;
+	if (fd != -1) {
 		/*
 		 * Linux follows Solaris mmap(2) description:
 		 * The file descriptor fildes is opened with
 		 * read permission, regardless of the
 		 * protection options specified.
 		 */
 
-		error = fget(td, bsd_args.fd,
-		    cap_rights_init(&rights, CAP_MMAP), &fp);
+		error = fget(td, fd, cap_rights_init(&rights, CAP_MMAP), &fp);
 		if (error != 0)
 			return (error);
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 
 		/* Linux mmap() just fails for O_WRONLY files */
 		if (!(fp->f_flag & FREAD)) {
 			fdrop(fp, td);
 			return (EACCES);
 		}
 
 		fdrop(fp, td);
 	}
 
 	if (flags & LINUX_MAP_GROWSDOWN) {
 		/*
 		 * The Linux MAP_GROWSDOWN option does not limit auto
 		 * growth of the region.  Linux mmap with this option
 		 * takes as addr the initial BOS, and as len, the initial
 		 * region size.  It can then grow down from addr without
 		 * limit.  However, Linux threads has an implicit internal
 		 * limit to stack size of STACK_SIZE.  Its just not
 		 * enforced explicitly in Linux.  But, here we impose
 		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
 		 * region, since we can do this with our mmap.
 		 *
 		 * Our mmap with MAP_STACK takes addr as the maximum
 		 * downsize limit on BOS, and as len the max size of
 		 * the region.  It then maps the top SGROWSIZ bytes,
 		 * and auto grows the region down, up to the limit
 		 * in addr.
 		 *
 		 * If we don't use the MAP_STACK option, the effect
 		 * of this code is to allocate a stack region of a
 		 * fixed size of (STACK_SIZE - GUARD_SIZE).
 		 */
 
 		if ((caddr_t)addr + len > vms->vm_maxsaddr) {
 			/*
 			 * Some Linux apps will attempt to mmap
 			 * thread stacks near the top of their
 			 * address space.  If their TOS is greater
 			 * than vm_maxsaddr, vm_map_growstack()
 			 * will confuse the thread stack with the
 			 * process stack and deliver a SEGV if they
 			 * attempt to grow the thread stack past their
 			 * current stacksize rlimit.  To avoid this,
 			 * adjust vm_maxsaddr upwards to reflect
 			 * the current stacksize rlimit rather
 			 * than the maximum possible stacksize.
 			 * It would be better to adjust the
 			 * mmap'ed region, but some apps do not check
 			 * mmap's return value.
 			 */
 			PROC_LOCK(p);
 			vms->vm_maxsaddr = (char *)p->p_sysent->sv_usrstack -
 			    lim_cur_proc(p, RLIMIT_STACK);
 			PROC_UNLOCK(p);
 		}
 
 		/*
 		 * This gives us our maximum stack size and a new BOS.
 		 * If we're using VM_STACK, then mmap will just map
 		 * the top SGROWSIZ bytes, and let the stack grow down
 		 * to the limit at BOS.  If we're not using VM_STACK
 		 * we map the full stack, since we don't have a way
 		 * to autogrow it.
 		 */
-		if (len > STACK_SIZE - GUARD_SIZE) {
-			bsd_args.addr = (caddr_t)addr;
-			bsd_args.len = len;
-		} else {
-			bsd_args.addr = (caddr_t)addr -
-			    (STACK_SIZE - GUARD_SIZE - len);
-			bsd_args.len = STACK_SIZE - GUARD_SIZE;
+		if (len <= STACK_SIZE - GUARD_SIZE) {
+			addr = addr - (STACK_SIZE - GUARD_SIZE - len);
+			len = STACK_SIZE - GUARD_SIZE;
 		}
-	} else {
-		bsd_args.addr = (caddr_t)addr;
-		bsd_args.len  = len;
 	}
-	bsd_args.pos = pos;
 
-	error = sys_mmap(td, &bsd_args);
+	error = kern_mmap(td, addr, len, prot, bsd_flags, fd, pos);
 
 	LINUX_CTR2(mmap2, "return: %d (%p)", error, td->td_retval[0]);
 
 	return (error);
 }
 
 int
 linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot)
 {
-	struct mprotect_args bsd_args;
 
-	bsd_args.addr = (void *)addr;
-	bsd_args.len = len;
-	bsd_args.prot = prot;
-
 #if defined(__amd64__)
-	linux_fixup_prot(td, &bsd_args.prot);
+	linux_fixup_prot(td, &prot);
 #endif
-	return (sys_mprotect(td, &bsd_args));
+	return (kern_mprotect(td, addr, len, prot));
 }
 
 #if defined(__amd64__)
 static void
 linux_fixup_prot(struct thread *td, int *prot)
 {
 	struct linux_pemuldata *pem;
 
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) {
 		pem = pem_find(td->td_proc);
 		if (pem->persona & LINUX_READ_IMPLIES_EXEC)
 			*prot |= PROT_EXEC;
 	}
 
 }
 #endif
Index: stable/11/sys/kern/kern_descrip.c
===================================================================
--- stable/11/sys/kern/kern_descrip.c	(revision 314333)
+++ stable/11/sys/kern/kern_descrip.c	(revision 314334)
@@ -1,4105 +1,4108 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static uma_zone_t file_zone;
 static uma_zone_t filedesc0_zone;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, int holdleaders);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static int	fd_last_used(struct filedesc *fdp, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct thread *td);
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	struct fdescenttbl *ft_table;
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 struct fdescenttbl0 {
 	int	fdt_nfiles;
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 struct filedesc0 {
 	struct filedesc fd_fd;
 	SLIST_HEAD(, freetable) fd_free;
 	struct	fdescenttbl0 fd_dfiles;
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * If low >= size, just return low. Otherwise find the first zero bit in the
  * given bitmap, starting at low and not exceeding size - 1. Return size if
  * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
 	if (low >= size)
 		return (low);
 
 	off = NDSLOT(low);
 	if (low % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		if ((mask &= ~map[off]) != 0UL)
 			return (off * NDENTRIES + ffsl(mask) - 1);
 		++off;
 	}
 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
 		if (map[off] != ~0UL)
 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
 	return (size);
 }
 
 /*
  * Find the highest non-zero bit in the given bitmap, starting at 0 and
  * not exceeding size - 1. Return -1 if not found.
  */
 static int
 fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		if ((mask &= map[off]) != 0)
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
  * Free a file descriptor.
  *
  * Avoid some work if fdp is about to be destroyed.
  */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 
 	filecaps_free(&fde->fde_caps);
 }
 
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *fde;
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&fde->fde_seq);
 #endif
 	fdefree_last(fde);
 	fde->fde_file = NULL;
 	fdunused(fdp, fd);
 #ifdef CAPABILITIES
 	seq_write_end(&fde->fde_seq);
 #endif
 }
 
 void
 pwd_ensure_dirs(void)
 {
 	struct filedesc *fdp;
 
 	fdp = curproc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_cdir == NULL) {
 		fdp->fd_cdir = rootvnode;
 		vrefact(rootvnode);
 	}
 	if (fdp->fd_rdir == NULL) {
 		fdp->fd_rdir = rootvnode;
 		vrefact(rootvnode);
 	}
 	FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 #ifdef	RACCT
 	uint64_t lim;
 #endif
 
 	td->td_retval[0] =
 	    min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
 #ifdef	RACCT
 	PROC_LOCK(td->td_proc);
 	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
 	PROC_UNLOCK(td->td_proc);
 	if (lim < td->td_retval[0])
 		td->td_retval[0] = lim;
 #endif
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
 }
 
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error, newcmd;
 
 	error = 0;
 	newcmd = cmd;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 			newcmd = F_GETLK;
 			break;
 		case F_OSETLK:
 			newcmd = F_SETLK;
 			break;
 		case F_OSETLKW:
 			newcmd = F_SETLKW;
 			break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
 		arg1 = (intptr_t)&fl;
 		break;
 	default:
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, flg, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	switch (cmd) {
 	case F_DUPFD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		FILEDESC_SLOCK(fdp);
 		if (fget_locked(fdp, fd) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fde = &fdp->fd_ofiles[fd];
 		td->td_retval[0] =
 		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		FILEDESC_XLOCK(fdp);
 		if (fget_locked(fdp, fd) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fde = &fdp->fd_ofiles[fd];
 		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp);
 		if (error != 0)
 			break;
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		cap_rights_init(&rights, CAP_FLOCK);
 		error = fget_unlocked(fdp, fd, &rights, &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			/*
 			 * Temporary api for testing remote lock
 			 * infrastructure.
 			 */
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * We had to drop the filedesc lock, so we need to recheck if
 		 * the descriptor is still valid, because if it was closed
 		 * in the meantime we need to remove advisory lock from the
 		 * vnode - close on any descriptor leading to an advisory
 		 * locked vnode, removes that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &rights, &fp2, NULL);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FLOCK), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			    foffset < OFF_MIN - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			fp->f_seqcount = (arg + bsize - 1) / bsize;
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp, 0);
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static int
 getmaxfd(struct thread *td)
 {
 
 	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  */
 int
 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *delfp;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
 	MPASS(mode < FDDUP_LASTMODE);
 
+	AUDIT_ARG_FD(old);
+	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
+
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(td);
 	if (new >= maxfd)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 
 	error = EBADF;
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL)
 		goto unlock;
 	if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
 		td->td_retval[0] = new;
 		if (flags & FDDUP_FLAG_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		error = 0;
 		goto unlock;
 	}
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	switch (mode) {
 	case FDDUP_NORMAL:
 	case FDDUP_FCNTL:
 		if ((error = fdalloc(td, new, &new)) != 0)
 			goto unlock;
 		break;
 	case FDDUP_MUSTREPLACE:
 		/* Target file descriptor must exist. */
 		if (fget_locked(fdp, new) == NULL)
 			goto unlock;
 		break;
 	case FDDUP_FIXED:
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(p);
 				error = racct_set(p, RACCT_NOFILE, new + 1);
 				PROC_UNLOCK(p);
 				if (error != 0) {
 					error = EMFILE;
 					goto unlock;
 				}
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 		}
 		if (!fdisused(fdp, new))
 			fdused(fdp, new);
 		break;
 	default:
 		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 	}
 
 	KASSERT(old != new, ("new fd is same as old"));
 
 	oldfde = &fdp->fd_ofiles[old];
 	fhold(oldfde->fde_file);
 	newfde = &fdp->fd_ofiles[new];
 	delfp = newfde->fde_file;
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seq_write_begin(&newfde->fde_seq);
 #endif
 	filecaps_free(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seq_write_end(&newfde->fde_seq);
 #endif
 	td->td_retval[0] = new;
 
 	error = 0;
 
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, 1);
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	} else {
 unlock:
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	return (error);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	if (*sigiop == NULL)
 		return;
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		free(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  */
 pid_t
 fgetown(sigiop)
 	struct sigio **sigiop;
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 /*
  * Function drops the filedesc lock on return.
  */
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     int holdleaders)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			holdleaders = 0;
 		}
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleteing a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 /*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdfree(fdp, fd);
 
 	/* closefp() drops the FILEDESC lock for us. */
 	return (closefp(fdp, fd, fp, td, 1));
 }
 
 /*
  * Close open file descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_closefrom(struct thread *td, struct closefrom_args *uap)
 {
 	struct filedesc *fdp;
 	int fd;
 
 	fdp = td->td_proc->p_fd;
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/*
 	 * Treat negative starting file descriptor values identical to
 	 * closefrom(0) which closes all files.
 	 */
 	if (uap->lowfd < 0)
 		uap->lowfd = 0;
 	FILEDESC_SLOCK(fdp);
 	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd].fde_file != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			(void)kern_close(td, fd);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	return (0);
 }
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat oub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtstat(&ub, &oub);
 		error = copyout(&oub, uap->sb, sizeof(oub));
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
 		ktrstat(sbp);
 #endif
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
 	if (error != 0)
 		return (error);
 
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = _POSIX_ASYNCHRONOUS_IO;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Initialize filecaps structure.
  */
 void
 filecaps_init(struct filecaps *fcaps)
 {
 
 	bzero(fcaps, sizeof(*fcaps));
 	fcaps->fc_nioctls = -1;
 }
 
 /*
  * Copy filecaps structure allocating memory for ioctls array if needed.
  *
  * The last parameter indicates whether the fdtable is locked. If it is not and
  * ioctls are encountered, copying fails and the caller must lock the table.
  *
  * Note that if the table was not locked, the caller has to check the relevant
  * sequence counter to determine whether the operation was successful.
  */
 int
 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 {
 	size_t size;
 
 	*dst = *src;
 	if (src->fc_ioctls == NULL)
 		return (0);
 	if (!locked)
 		return (1);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 	return (0);
 }
 
 /*
  * Move filecaps structure to the new place and clear the old place.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(*src));
 }
 
 /*
  * Fill the given filecaps structure with full rights.
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	CAP_ALL(&fcaps->fc_rights);
 	fcaps->fc_ioctls = NULL;
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 }
 
 /*
  * Free memory allocated within filecaps structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(*fcaps));
 }
 
 /*
  * Validate the given filecaps structure.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 	    ("%s: invalid ioctls", func));
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int nfd1;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	nfd1 = fdp->fd_nfiles * 2;
 	if (nfd1 < nfd)
 		nfd1 = nfd;
 	fdgrowtable(fdp, nfd1);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	/*
 	 * If lastfile is -1 this struct filedesc was just allocated and we are
 	 * growing it to accommodate for the one we are going to copy from. There
 	 * is no need to have a lock on this one as it's not visible to anyone.
 	 */
 	if (fdp->fd_lastfile != -1)
 		FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * We place the struct freetable in the middle so we don't have
 	 * to worry about padding.
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Do not free the old file table, as some threads may still
 	 * reference entries within it.  Instead, place it on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 		fdp0 = (struct filedesc0 *)fdp;
 		ft->ft_table = otable;
 		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(td);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (fd >= maxfd)
 		return (EMFILE);
 	if (fd >= fdp->fd_nfiles) {
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			error = racct_set(p, RACCT_NOFILE, allocfd);
 			PROC_UNLOCK(p);
 			if (error != 0)
 				return (EMFILE);
 		}
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate n file descriptors for the process.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int i;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	for (i = 0; i < n; i++)
 		if (fdalloc(td, 0, &fds[i]) != 0)
 			break;
 
 	if (i < n) {
 		for (i--; i >= 0; i--)
 			fdunused(fdp, fds[i]);
 		return (EMFILE);
 	}
 
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
     struct filecaps *fcaps)
 {
 	struct file *fp;
 	int error, fd;
 
 	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);		/* no reference held on error */
 
 	error = finstall(td, fp, &fd, flags, fcaps);
 	if (error) {
 		fdrop(fp, td);		/* one reference (fp only) */
 		return (error);
 	}
 
 	if (resultfp != NULL)
 		*resultfp = fp;		/* copy out result */
 	else
 		fdrop(fp, td);		/* release local reference */
 
 	if (resultfd != NULL)
 		*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  */
 int
 falloc_noinstall(struct thread *td, struct file **resultfp)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	int openfiles_new;
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 
 	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 	if ((openfiles_new >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles_new >= maxfiles) {
 		atomic_subtract_int(&openfiles, 1);
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 		}
 		return (ENFILE);
 	}
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	refcount_init(&fp->f_count, 1);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 /*
  * Install a file in a file descriptor table.
  */
 void
 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedescent *fde;
 
 	MPASS(fp != NULL);
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&fde->fde_seq);
 #endif
 	fde->fde_file = fp;
 	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 	if (fcaps != NULL)
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
 #ifdef CAPABILITIES
 	seq_write_end(&fde->fde_seq);
 #endif
 }
 
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 
 	MPASS(fd != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	if ((error = fdalloc(td, 0, fd))) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	fhold(fp);
 	_finstall(fdp, fp, *fd, flags, fcaps);
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * If fdp is not NULL, return with it shared locked.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	refcount_init(&newfdp->fd_refcnt, 1);
 	refcount_init(&newfdp->fd_holdcnt, 1);
 	newfdp->fd_cmask = CMASK;
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_lastfile = -1;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 
 	FILEDESC_SLOCK(fdp);
 	newfdp->fd_cdir = fdp->fd_cdir;
 	if (newfdp->fd_cdir)
 		vrefact(newfdp->fd_cdir);
 	newfdp->fd_rdir = fdp->fd_rdir;
 	if (newfdp->fd_rdir)
 		vrefact(newfdp->fd_rdir);
 	newfdp->fd_jdir = fdp->fd_jdir;
 	if (newfdp->fd_jdir)
 		vrefact(newfdp->fd_jdir);
 
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 	} else {
 		while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 			FILEDESC_SUNLOCK(fdp);
 			fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 
 	return (newfdp);
 }
 
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	fdp = p->p_fd;
 	if (fdp != NULL)
 		refcount_acquire(&fdp->fd_holdcnt);
 	return (fdp);
 }
 
 static void
 fddrop(struct filedesc *fdp)
 {
 
 	if (fdp->fd_holdcnt > 1) {
 		if (refcount_release(&fdp->fd_holdcnt) == 0)
 			return;
 	}
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 /*
  * Share a filedesc structure.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	refcount_acquire(&fdp->fd_refcnt);
 	return (fdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
 fdunshare(struct thread *td)
 {
 	struct filedesc *tmp;
 	struct proc *p = td->td_proc;
 
 	if (p->p_fd->fd_refcnt == 1)
 		return;
 
 	tmp = fdcopy(p->p_fd);
 	fdescfree(td);
 	p->p_fd = tmp;
 }
 
 void
 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 {
 
 	fdescfree(td);
 	td->td_proc->p_fd = fdp;
 }
 
 /*
  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
  * this is to ease callers, not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Copies a filedesc structure, while remapping all file descriptors
  * stored inside using a translation table.
  *
  * File descriptors are copied over to the new file descriptor table,
  * regardless of whether the close-on-exec flag is set.
  */
 int
 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
     struct filedesc **ret)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int error, i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	if (nfds > fdp->fd_lastfile + 1) {
 		/* New table cannot be larger than the old one. */
 		error = E2BIG;
 		goto bad;
 	}
 	/* Copy all passable descriptors (i.e. not kqueue). */
 	newfdp->fd_freefile = nfds;
 	for (i = 0; i < nfds; ++i) {
 		if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) {
 			/* File descriptor out of bounds. */
 			error = EBADF;
 			goto bad;
 		}
 		ofde = &fdp->fd_ofiles[fds[i]];
 		if (ofde->fde_file == NULL) {
 			/* Unused file descriptor. */
 			error = EBADF;
 			goto bad;
 		}
 		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* File descriptor cannot be passed. */
 			error = EINVAL;
 			goto bad;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	*ret = newfdp;
 	return (0);
 bad:
 	FILEDESC_SUNLOCK(fdp);
 	fdescfree_remapped(newfdp);
 	return (error);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		for (i = 0; i <= fdp->fd_lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE)
 				continue;
 			fhold(fp);
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or kern_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  */
 static void
 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	int i;
 
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			if (needclose)
 				(void) closef(fp, td);
 			else
 				fdrop(fp, td);
 		}
 	}
 
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	fddrop(fdp);
 }
 
 void
 fdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *fdp;
 	struct vnode *cdir, *jdir, *rdir;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_set(p, RACCT_NOFILE, 0);
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (p->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	PROC_LOCK(p);
 	p->p_fd = NULL;
 	PROC_UNLOCK(p);
 
 	if (refcount_release(&fdp->fd_refcnt) == 0)
 		return;
 
 	FILEDESC_XLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
 	if (cdir != NULL)
 		vrele(cdir);
 	if (rdir != NULL)
 		vrele(rdir);
 	if (jdir != NULL)
 		vrele(jdir);
 
 	fdescfree_fds(td, fdp, 1);
 }
 
 void
 fdescfree_remapped(struct filedesc *fdp)
 {
 
 	if (fdp->fd_cdir != NULL)
 		vrele(fdp->fd_cdir);
 	if (fdp->fd_rdir != NULL)
 		vrele(fdp->fd_rdir);
 	if (fdp->fd_jdir != NULL)
 		vrele(fdp->fd_jdir);
 
 	fdescfree_fds(curthread, fdp, 0);
 }
 
 /*
  * For setugid programs, we don't want to people to use that setugidness
  * to generate error messages which write to a file which otherwise would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 	struct vnode *vp;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (false);
 
 	vp = fp->f_vnode;
 	return ((vp->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setguid thing safe, if at all possible.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	for (i = 0; i <= 2; i++) {
 		fp = fdp->fd_ofiles[i].fde_file;
 		if (fp != NULL && is_unsafe(fp)) {
 			FILEDESC_XLOCK(fdp);
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fdfree(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
 	}
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  */
 void
 fdclose(struct thread *td, struct file *fp, int idx)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file == fp) {
 		fdfree(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 	} else
 		FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedescent *fde;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fde->fde_flags & UF_EXCLOSE))) {
 			FILEDESC_XLOCK(fdp);
 			fdfree(fdp, i);
 			(void) closefp(fdp, i, fp, td, 0);
 			FILEDESC_UNLOCK_ASSERT(fdp);
 		}
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		vp = fp->f_vnode;
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			    fdtol != td->td_proc->p_fdtol;
 			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				    P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The ops are set with release semantics to be certain that the flags, type,
  * and data are visible when ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 	fp->f_data = data;
 	fp->f_flag = flag;
 	fp->f_type = type;
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, seq_t *seqp)
 {
 #ifdef CAPABILITIES
 	struct filedescent *fde;
 #endif
 	struct fdescenttbl *fdt;
 	struct file *fp;
 	u_int count;
 #ifdef CAPABILITIES
 	seq_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	if ((u_int)fd >= fdt->fdt_nfiles)
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		seq = seq_read(fd_seq(fdt, fd));
 		fde = &fdt->fdt_ofiles[fd];
 		haverights = *cap_rights_fde(fde);
 		fp = fde->fde_file;
 		if (!seq_consistent(fd_seq(fdt, fd), seq))
 			continue;
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		error = cap_check(&haverights, needrightsp);
 		if (error != 0)
 			return (error);
 #endif
 	retry:
 		count = fp->f_count;
 		if (count == 0) {
 			/*
 			 * Force a reload. Other thread could reallocate the
 			 * table before this fd was closed, so it possible that
 			 * there is a stale fp pointer in cached version.
 			 */
 			fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0)
 			goto retry;
 		fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (seqp != NULL) {
 #ifdef CAPABILITIES
 		*seqp = seq;
 #endif
 	}
 	return (0);
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
  * File's rights will be checked against the capability rights mask.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp, seq_t *seqp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 */
 	error = 0;
 	switch (flags) {
 	case FREAD:
 	case FWRITE:
 		if ((fp->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 		    ((fp->f_flag & FWRITE) != 0))
 			error = EBADF;
 		break;
 	case 0:
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, 0, rightsp, NULL));
 }
 
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
     struct file **fpp)
 {
 	int error;
 #ifndef CAPABILITIES
 	error = _fget(td, fd, fpp, 0, rightsp, NULL);
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 #else
 	struct filedesc *fdp = td->td_proc->p_fd;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 	for (;;) {
 		error = _fget(td, fd, fpp, 0, rightsp, &seq);
 		if (error != 0)
 			return (error);
 		/*
 		 * If requested, convert capability rights to access flags.
 		 */
 		if (maxprotp != NULL)
 			*maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd));
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 #endif
 	return (error);
 }
 
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
 }
 
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
 }
 
 int
 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
     struct file **fpp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 #ifndef CAPABILITIES
 	return (fget_unlocked(fdp, fd, rightsp, fpp, NULL));
 #else
 	int error;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 	for (;;) {
 		error = fget_unlocked(fdp, fd, rightsp, fpp, &seq);
 		if (error != 0)
 			return (error);
 		error = cap_fcntl_check(fdp, fd, needfcntl);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 	if (error != 0) {
 		fdrop(*fpp, td);
 		*fpp = NULL;
 	}
 	return (error);
 #endif
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
 	if (error != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vrefact(*vpp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, 0, rightsp, vpp));
 }
 
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 #ifdef CAPABILITIES
 	int error;
 #endif
 
 	fdp = td->td_proc->p_fd;
 	fp = fget_locked(fdp, fd);
 	if (fp == NULL || fp->f_ops == &badfileops)
 		return (EBADF);
 
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights(fdp, fd), needrightsp);
 	if (error != 0)
 		return (error);
 #endif
 
 	if (fp->f_vnode == NULL)
 		return (EINVAL);
 
 	*vpp = fp->f_vnode;
 	vrefact(*vpp);
 	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps, true);
 
 	return (0);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 }
 
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 }
 
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 }
 #endif
 
 /*
  * Like fget() but loads the underlying socket, or returns an error if the
  * descriptor does not represent a socket.
  *
  * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
  * in the future.
  *
  * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
  * on their file descriptor reference to prevent the socket from being free'd
  * during use.
  */
 int
 fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
     u_int *fflagp)
 {
 	struct file *fp;
 	int error;
 
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
 	} else {
 		*spp = fp->f_data;
 		if (fflagp)
 			*fflagp = fp->f_flag;
 		SOCK_LOCK(*spp);
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in the
  * future.  The last reference closes the socket.
  *
  * Note: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Handle the last reference to a file being closed.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 
 	if (fp->f_count != 0)
 		panic("fdrop: count %d", fp->f_count);
 	error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	int error, indx;
 
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		fhold(fp);
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		oldfde->fde_file = NULL;
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct vnode *vp;
 	struct file *fp;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
 		if (fp->f_type == DTYPE_VNODE) {
 			vp = fp->f_vnode;
 			if (vp->v_type == VDIR)
 				return (EPERM);
 		}
 	}
 	return (0);
 }
 
 /*
  * Common routine for kern_chroot() and jail_attach().  The caller is
  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
  * authorize this operation.
  */
 int
 pwd_chroot(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
 	oldvp = fdp->fd_rdir;
 	vrefact(vp);
 	fdp->fd_rdir = vp;
 	if (fdp->fd_jdir == NULL) {
 		vrefact(vp);
 		fdp->fd_jdir = vp;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 	return (0);
 }
 
 void
 pwd_chdir(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("chdir to a vnode with zero usecount"));
 	oldvp = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 }
 
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	if (vrefcnt(olddp) == 1)
 		return;
 	nrele = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_XLOCK(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_jdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_jdir = newdp;
 			nrele++;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 	}
 	sx_sunlock(&allproc_lock);
 	if (rootvnode == olddp) {
 		vrefact(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vrefact(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vrefact(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	while (nrele--)
 		vrele(olddp);
 }
 
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol;
 
 	fdtol = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
 		FILEDESC_XLOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
 		FILEDESC_XUNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 	}
 	return (fdtol);
 }
 
 static int
 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 {
 	struct filedesc *fdp;
 	int i, count, slots;
 
 	if (*(int *)arg1 != 0)
 		return (EINVAL);
 
 	fdp = curproc->p_fd;
 	count = 0;
 	FILEDESC_SLOCK(fdp);
 	slots = NDSLOTS(fdp->fd_lastfile + 1);
 	for (i = 0; i < slots; i++)
 		count += bitcountl(fdp->fd_map[i]);
 	FILEDESC_SUNLOCK(fdp);
 
 	return (SYSCTL_OUT(req, &count, sizeof(count)));
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
     "Number of open file descriptors");
 
 /*
  * Get file structures globally.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			fdp = fdhold(p);
 			PROC_UNLOCK(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			if (fdp->fd_lastfile > 0)
 				n += fdp->fd_lastfile;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int i;
 	int kflags;
 
 	kflags = 0;
 	for (i = 0; i < nitems(fflags_table); i++)
 		if (fflags & fflags_table[i].fflag)
 			kflags |=  fflags_table[i].kf_fflag;
 	return (kflags);
 }
 
 /* Trim unused data from kf_path by truncating the structure size. */
 static void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 	    strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp != NULL)
 		kif->kf_cap_rights = *rightsp;
 	else
 		cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = fp->f_count;
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	error = fo_fill_kinfo(fp, kif, fdp);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 }
 
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	error = vn_fill_kinfo_vnode(vp, kif);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	vrele(vp);
 }
 
 struct export_fd_buf {
 	struct filedesc		*fdp;
 	struct sbuf 		*sb;
 	ssize_t			remainder;
 	struct kinfo_file	kif;
 	int			flags;
 };
 
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	struct kinfo_file *kif;
 
 	kif = &efbuf->kif;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < kif->kf_structsize) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= kif->kf_structsize;
 	}
 	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
 }
 
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 	    efbuf->flags);
 	FILEDESC_SUNLOCK(efbuf->fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SUNLOCK(efbuf->fdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 /*
  * Store a process file descriptor information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
     int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	int error, i;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
 		vrefact(tracevp);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vrefact(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vrefact(cttyvp);
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	efbuf->flags = flags;
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	if (fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	FILEDESC_SLOCK(fdp);
 	/* working directory */
 	if (fdp->fd_cdir != NULL) {
 		vrefact(fdp->fd_cdir);
 		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	/* root directory */
 	if (fdp->fd_rdir != NULL) {
 		vrefact(fdp->fd_rdir);
 		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 	}
 	/* jail directory */
 	if (fdp->fd_jdir != NULL) {
 		vrefact(fdp->fd_jdir);
 		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 	}
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		cap_rights_init(&rights);
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 fail:
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen,
 	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_type = kif->kf_type;
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_ref_count = kif->kf_ref_count;
 	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 	okif->kf_offset = kif->kf_offset;
 	okif->kf_vnode_type = kif->kf_vnode_type;
 	okif->kf_sock_domain = kif->kf_sock_domain;
 	okif->kf_sock_type = kif->kf_sock_type;
 	okif->kf_sock_protocol = kif->kf_sock_protocol;
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 	okif->kf_sa_local = kif->kf_sa_local;
 	okif->kf_sa_peer = kif->kf_sa_peer;
 }
 
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
 {
 	int error;
 
 	vrefact(vp);
 	FILEDESC_SUNLOCK(fdp);
 	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	int error, i, *name;
 	struct file *fp;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (ENOENT);
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
 		    okif, fdp, req);
 	if (fdp->fd_rdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
 		    okif, fdp, req);
 	if (fdp->fd_jdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 		    okif, fdp, req);
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp,
 		    KERN_FILEDESC_PACK_KINFO);
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 int
 vntype_to_kinfo(int vtype)
 {
 	struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process current working directory information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = fdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir == NULL)
 		error = EINVAL;
 	else {
 		vrefact(fdp->fd_cdir);
 		error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
 		    FREAD, efbuf);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 
 	switch (type) {
 	case 0:
 		return ("zero");
 	case DTYPE_VNODE:
 		return ("vnod");
 	case DTYPE_SOCKET:
 		return ("sock");
 	case DTYPE_PIPE:
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
 	case DTYPE_KQUEUE:
 		return ("kque");
 	case DTYPE_CRYPTO:
 		return ("crpt");
 	case DTYPE_MQUEUE:
 		return ("mque");
 	case DTYPE_SHM:
 		return ("shm");
 	case DTYPE_SEM:
 		return ("ksem");
 	default:
 		return ("unkn");
 	}
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n <= fdp->fd_lastfile; n++) {
 			if (fp == fdp->fd_ofiles[n].fde_file)
 				return (p);
 		}
 	}
 	return (NULL);
 }
 
 static void
 db_print_file(struct file *fp, int header)
 {
 	struct proc *p;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 	p = file_to_first_proc(fp);
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    0, fp->f_count, 0, fp->f_vnode,
 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 }
 
 DB_SHOW_COMMAND(file, db_show_file)
 {
 	struct file *fp;
 
 	if (!have_addr) {
 		db_printf("usage: show file <addr>\n");
 		return;
 	}
 	fp = (struct file *)addr;
 	db_print_file(fp, 1);
 }
 
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header;
 	int n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		if ((fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			db_print_file(fp, header);
 			header = 0;
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return (0);
 }
 
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	return (ENOTTY);
 }
 
 int
 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (poll_no_poll(events));
 }
 
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Index: stable/11/sys/kern/sys_generic.c
===================================================================
--- stable/11/sys/kern/sys_generic.c	(revision 314333)
+++ stable/11/sys/kern/sys_generic.c	(revision 314334)
@@ -1,1948 +1,1951 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capsicum.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/sleepqueue.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/condvar.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 
 /*
  * The following macro defines how many bytes will be allocated from
  * the stack instead of memory allocated when passing the IOCTL data
  * structures from userspace and to the kernel. Some IOCTLs having
  * small data structures are used very frequently and this small
  * buffer on the stack gives a significant speedup improvement for
  * those requests. The value of this define should be greater or equal
  * to 64 bytes and should also be power of two. The data structure is
  * currently hard-aligned to a 8-byte boundary on the stack. This
  * should currently be sufficient for all supported platforms.
  */
 #define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
 #define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
 
 #ifdef __LP64__
 static int iosize_max_clamp = 0;
 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
     &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
 static int devfs_iosize_max_clamp = 1;
 SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
     &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
 #endif
 
 /*
  * Assert that the return value of read(2) and write(2) syscalls fits
  * into a register.  If not, an architecture will need to provide the
  * usermode wrappers to reconstruct the result.
  */
 CTASSERT(sizeof(register_t) >= sizeof(size_t));
 
 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
 
 static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
 		    u_int);
 static int	pollscan(struct thread *, struct pollfd *, u_int);
 static int	pollrescan(struct thread *);
 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
 static int	selrescan(struct thread *, fd_mask **, fd_mask **);
 static void	selfdalloc(struct thread *, void *);
 static void	selfdfree(struct seltd *, struct selfd *);
 static int	dofileread(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 static void	seltdinit(struct thread *);
 static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
 static void	seltdclear(struct thread *);
 
 /*
  * One seltd per-thread allocated on demand as needed.
  *
  *	t - protected by st_mtx
  * 	k - Only accessed by curthread or read-only
  */
 struct seltd {
 	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
 	struct selfd		*st_free1;	/* (k) free fd for read set. */
 	struct selfd		*st_free2;	/* (k) free fd for write set. */
 	struct mtx		st_mtx;		/* Protects struct seltd */
 	struct cv		st_wait;	/* (t) Wait channel. */
 	int			st_flags;	/* (t) SELTD_ flags. */
 };
 
 #define	SELTD_PENDING	0x0001			/* We have pending events. */
 #define	SELTD_RESCAN	0x0002			/* Doing a rescan. */
 
 /*
  * One selfd allocated per-thread per-file-descriptor.
  *	f - protected by sf_mtx
  */
 struct selfd {
 	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
 	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
 	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
 	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
 	struct seltd		*sf_td;		/* (k) owning seltd. */
 	void			*sf_cookie;	/* (k) fd or pollfd. */
 	u_int			sf_refs;
 };
 
 static uma_zone_t selfd_zone;
 static struct mtx_pool *mtxpool_select;
 
 #ifdef __LP64__
 size_t
 devfs_iosize_max(void)
 {
 
 	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
 	    INT_MAX : SSIZE_MAX);
 }
 
 size_t
 iosize_max(void)
 {
 
 	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
 	    INT_MAX : SSIZE_MAX);
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct read_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 };
 #endif
 int
 sys_read(td, uap)
 	struct thread *td;
 	struct read_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_readv(td, uap->fd, &auio);
 	return(error);
 }
 
 /*
  * Positioned read system call
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pread_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 int
 sys_pread(td, uap)
 	struct thread *td;
 	struct pread_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
 	return(error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 int
 freebsd6_pread(td, uap)
 	struct thread *td;
 	struct freebsd6_pread_args *uap;
 {
 	struct pread_args oargs;
 
 	oargs.fd = uap->fd;
 	oargs.buf = uap->buf;
 	oargs.nbyte = uap->nbyte;
 	oargs.offset = uap->offset;
 	return (sys_pread(td, &oargs));
 }
 #endif
 
 /*
  * Scatter read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 int
 sys_readv(struct thread *td, struct readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_readv(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
 	if (error)
 		return (error);
 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Scatter positioned read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct preadv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 int
 sys_preadv(struct thread *td, struct preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, uap->offset);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_preadv(td, fd, auio, offset)
 	struct thread *td;
 	int fd;
 	struct uio *auio;
 	off_t offset;
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
 		error = EINVAL;
 	else
 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for readv and preadv that reads data in
  * from a file using the passed in uio, offset, and flags.
  */
 static int
 dofileread(td, fd, fp, auio, offset, flags)
 	struct thread *td;
 	int fd;
 	struct file *fp;
 	struct uio *auio;
 	off_t offset;
 	int flags;
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
+	AUDIT_ARG_FD(fd);
+
 	/* Finish zero length reads right here */
 	if (auio->uio_resid == 0) {
 		td->td_retval[0] = 0;
 		return(0);
 	}
 	auio->uio_rw = UIO_READ;
 	auio->uio_offset = offset;
 	auio->uio_td = td;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO)) 
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_READ, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct write_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 };
 #endif
 int
 sys_write(td, uap)
 	struct thread *td;
 	struct write_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_writev(td, uap->fd, &auio);
 	return(error);
 }
 
 /*
  * Positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwrite_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 int
 sys_pwrite(td, uap)
 	struct thread *td;
 	struct pwrite_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
 	return(error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 int
 freebsd6_pwrite(td, uap)
 	struct thread *td;
 	struct freebsd6_pwrite_args *uap;
 {
 	struct pwrite_args oargs;
 
 	oargs.fd = uap->fd;
 	oargs.buf = uap->buf;
 	oargs.nbyte = uap->nbyte;
 	oargs.offset = uap->offset;
 	return (sys_pwrite(td, &oargs));
 }
 #endif
 
 /*
  * Gather write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct writev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 int
 sys_writev(struct thread *td, struct writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_writev(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
 	if (error)
 		return (error);
 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Gather positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwritev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 int
 sys_pwritev(struct thread *td, struct pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_pwritev(td, fd, auio, offset)
 	struct thread *td;
 	struct uio *auio;
 	int fd;
 	off_t offset;
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
 		error = EINVAL;
 	else
 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for writev and pwritev that writes data to
  * a file using the passed in uio, offset, and flags.
  */
 static int
 dofilewrite(td, fd, fp, auio, offset, flags)
 	struct thread *td;
 	int fd;
 	struct file *fp;
 	struct uio *auio;
 	off_t offset;
 	int flags;
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
+	AUDIT_ARG_FD(fd);
 	auio->uio_rw = UIO_WRITE;
 	auio->uio_td = td;
 	auio->uio_offset = offset;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	if (fp->f_type == DTYPE_VNODE &&
 	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
 		bwillwrite();
 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Socket layer is responsible for issuing SIGPIPE. */
 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_WRITE, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
 /*
  * Truncate a file given a file descriptor.
  *
  * Can't use fget_write() here, since must return EINVAL and not EBADF if the
  * descriptor isn't writable.
  */
 int
 kern_ftruncate(td, fd, length)
 	struct thread *td;
 	int fd;
 	off_t length;
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	if (length < 0)
 		return (EINVAL);
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
 	if (error)
 		return (error);
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if (!(fp->f_flag & FWRITE)) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = fo_truncate(fp, length, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ftruncate_args {
 	int	fd;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 sys_ftruncate(td, uap)
 	struct thread *td;
 	struct ftruncate_args *uap;
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct oftruncate_args {
 	int	fd;
 	long	length;
 };
 #endif
 int
 oftruncate(td, uap)
 	struct thread *td;
 	struct oftruncate_args *uap;
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct ioctl_args {
 	int	fd;
 	u_long	com;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 sys_ioctl(struct thread *td, struct ioctl_args *uap)
 {
 	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
 	u_long com;
 	int arg, error;
 	u_int size;
 	caddr_t data;
 
 	if (uap->com > 0xffffffff) {
 		printf(
 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
 		    td->td_proc->p_pid, td->td_name, uap->com);
 		uap->com &= 0xffffffff;
 	}
 	com = uap->com;
 
 	/*
 	 * Interpret high order word to find amount of data to be
 	 * copied to/from the user's address space.
 	 */
 	size = IOCPARM_LEN(com);
 	if ((size > IOCPARM_MAX) ||
 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	    ((com & IOC_OUT) && size == 0) ||
 #else
 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
 #endif
 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
 		return (ENOTTY);
 
 	if (size > 0) {
 		if (com & IOC_VOID) {
 			/* Integer argument. */
 			arg = (intptr_t)uap->data;
 			data = (void *)&arg;
 			size = 0;
 		} else {
 			if (size > SYS_IOCTL_SMALL_SIZE)
 				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
 			else
 				data = smalldata;
 		}
 	} else
 		data = (void *)&uap->data;
 	if (com & IOC_IN) {
 		error = copyin(uap->data, data, (u_int)size);
 		if (error != 0)
 			goto out;
 	} else if (com & IOC_OUT) {
 		/*
 		 * Zero the buffer so the user always
 		 * gets back something deterministic.
 		 */
 		bzero(data, size);
 	}
 
 	error = kern_ioctl(td, uap->fd, com, data);
 
 	if (error == 0 && (com & IOC_OUT))
 		error = copyout(data, uap->data, (u_int)size);
 
 out:
 	if (size > SYS_IOCTL_SMALL_SIZE)
 		free(data, M_IOCTLOPS);
 	return (error);
 }
 
 int
 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 #ifndef CAPABILITIES
 	cap_rights_t rights;
 #endif
 	int error, tmp, locked;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_CMD(com);
 
 	fdp = td->td_proc->p_fd;
 
 	switch (com) {
 	case FIONCLEX:
 	case FIOCLEX:
 		FILEDESC_XLOCK(fdp);
 		locked = LA_XLOCKED;
 		break;
 	default:
 #ifdef CAPABILITIES
 		FILEDESC_SLOCK(fdp);
 		locked = LA_SLOCKED;
 #else
 		locked = LA_UNLOCKED;
 #endif
 		break;
 	}
 
 #ifdef CAPABILITIES
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		error = EBADF;
 		goto out;
 	}
 	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
 		fp = NULL;	/* fhold() was not called yet */
 		goto out;
 	}
 	fhold(fp);
 	if (locked == LA_SLOCKED) {
 		FILEDESC_SUNLOCK(fdp);
 		locked = LA_UNLOCKED;
 	}
 #else
 	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
 	if (error != 0) {
 		fp = NULL;
 		goto out;
 	}
 #endif
 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	switch (com) {
 	case FIONCLEX:
 		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
 		goto out;
 	case FIOCLEX:
 		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
 		goto out;
 	case FIONBIO:
 		if ((tmp = *(int *)data))
 			atomic_set_int(&fp->f_flag, FNONBLOCK);
 		else
 			atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		data = (void *)&tmp;
 		break;
 	case FIOASYNC:
 		if ((tmp = *(int *)data))
 			atomic_set_int(&fp->f_flag, FASYNC);
 		else
 			atomic_clear_int(&fp->f_flag, FASYNC);
 		data = (void *)&tmp;
 		break;
 	}
 
 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
 out:
 	switch (locked) {
 	case LA_XLOCKED:
 		FILEDESC_XUNLOCK(fdp);
 		break;
 #ifdef CAPABILITIES
 	case LA_SLOCKED:
 		FILEDESC_SUNLOCK(fdp);
 		break;
 #endif
 	default:
 		FILEDESC_UNLOCK_ASSERT(fdp);
 		break;
 	}
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 int
 poll_no_poll(int events)
 {
 	/*
 	 * Return true for read/write.  If the user asked for something
 	 * special, return POLLNVAL, so that clients have a way of
 	 * determining reliably whether or not the extended
 	 * functionality is present without hard-coding knowledge
 	 * of specific filesystem implementations.
 	 */
 	if (events & ~POLLSTANDARD)
 		return (POLLNVAL);
 
 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 }
 
 int
 sys_pselect(struct thread *td, struct pselect_args *uap)
 {
 	struct timespec ts;
 	struct timeval tv, *tvp;
 	sigset_t set, *uset;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts, sizeof(ts));
 		if (error != 0)
 		    return (error);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->sm != NULL) {
 		error = copyin(uap->sm, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		uset = &set;
 	} else
 		uset = NULL;
 	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    uset, NFDBITS));
 }
 
 int
 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
     struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
 {
 	int error;
 
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &td->td_oldsigmask, 0);
 		if (error != 0)
 			return (error);
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct select_args {
 	int	nd;
 	fd_set	*in, *ou, *ex;
 	struct	timeval *tv;
 };
 #endif
 int
 sys_select(struct thread *td, struct select_args *uap)
 {
 	struct timeval tv, *tvp;
 	int error;
 
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv, sizeof(tv));
 		if (error)
 			return (error);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    NFDBITS));
 }
 
 /*
  * In the unlikely case when user specified n greater then the last
  * open file descriptor, check that no bits are set after the last
  * valid fd.  We must return EBADF if any is set.
  *
  * There are applications that rely on the behaviour.
  *
  * nd is fd_lastfile + 1.
  */
 static int
 select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
 {
 	char *addr, *oaddr;
 	int b, i, res;
 	uint8_t bits;
 
 	if (nd >= ndu || fd_in == NULL)
 		return (0);
 
 	oaddr = NULL;
 	bits = 0; /* silence gcc */
 	for (i = nd; i < ndu; i++) {
 		b = i / NBBY;
 #if BYTE_ORDER == LITTLE_ENDIAN
 		addr = (char *)fd_in + b;
 #else
 		addr = (char *)fd_in;
 		if (abi_nfdbits == NFDBITS) {
 			addr += rounddown(b, sizeof(fd_mask)) +
 			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
 		} else {
 			addr += rounddown(b, sizeof(uint32_t)) +
 			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
 		}
 #endif
 		if (addr != oaddr) {
 			res = fubyte(addr);
 			if (res == -1)
 				return (EFAULT);
 			oaddr = addr;
 			bits = res;
 		}
 		if ((bits & (1 << (i % NBBY))) != 0)
 			return (EBADF);
 	}
 	return (0);
 }
 
 int
 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
     fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
 {
 	struct filedesc *fdp;
 	/*
 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
 	 * of 256.
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
 	struct timeval rtv;
 	sbintime_t asbt, precision, rsbt;
 	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
 	int error, lf, ndu;
 
 	if (nd < 0)
 		return (EINVAL);
 	fdp = td->td_proc->p_fd;
 	ndu = nd;
 	lf = fdp->fd_lastfile;
 	if (nd > lf + 1)
 		nd = lf + 1;
 
 	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Allocate just enough bits for the non-null fd_sets.  Use the
 	 * preallocated auto buffer if possible.
 	 */
 	nfdbits = roundup(nd, NFDBITS);
 	ncpbytes = nfdbits / NBBY;
 	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
 	nbufbytes = 0;
 	if (fd_in != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ou != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ex != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (nbufbytes <= sizeof s_selbits)
 		selbits = &s_selbits[0];
 	else
 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
 
 	/*
 	 * Assign pointers into the bit buffers and fetch the input bits.
 	 * Put the output buffers together so that they can be bzeroed
 	 * together.
 	 */
 	sbp = selbits;
 #define	getbits(name, x) \
 	do {								\
 		if (name == NULL) {					\
 			ibits[x] = NULL;				\
 			obits[x] = NULL;				\
 		} else {						\
 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
 			obits[x] = sbp;					\
 			sbp += ncpbytes / sizeof *sbp;			\
 			error = copyin(name, ibits[x], ncpubytes);	\
 			if (error != 0)					\
 				goto done;				\
 			bzero((char *)ibits[x] + ncpubytes,		\
 			    ncpbytes - ncpubytes);			\
 		}							\
 	} while (0)
 	getbits(fd_in, 0);
 	getbits(fd_ou, 1);
 	getbits(fd_ex, 2);
 #undef	getbits
 
 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
 	/*
 	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
 	 * we are running under 32-bit emulation. This should be more
 	 * generic.
 	 */
 #define swizzle_fdset(bits)						\
 	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
 		int i;							\
 		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
 			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
 	}
 #else
 #define swizzle_fdset(bits)
 #endif
 
 	/* Make sure the bit order makes it through an ABI transition */
 	swizzle_fdset(ibits[0]);
 	swizzle_fdset(ibits[1]);
 	swizzle_fdset(ibits[2]);
 	
 	if (nbufbytes != 0)
 		bzero(selbits, nbufbytes / 2);
 
 	precision = 0;
 	if (tvp != NULL) {
 		rtv = *tvp;
 		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
 		    rtv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
 		if (!timevalisset(&rtv))
 			asbt = 0;
 		else if (rtv.tv_sec <= INT32_MAX) {
 			rsbt = tvtosbt(rtv);
 			precision = rsbt;
 			precision >>= tc_precexp;
 			if (TIMESEL(&asbt, rsbt))
 				asbt += tc_tick_sbt;
 			if (asbt <= SBT_MAX - rsbt)
 				asbt += rsbt;
 			else
 				asbt = -1;
 		} else
 			asbt = -1;
 	} else
 		asbt = -1;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
 		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
 		if (error || td->td_retval[0] != 0)
 			break;
 	}
 	seltdclear(td);
 
 done:
 	/* select is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 
 	/* swizzle bit order back, if necessary */
 	swizzle_fdset(obits[0]);
 	swizzle_fdset(obits[1]);
 	swizzle_fdset(obits[2]);
 #undef swizzle_fdset
 
 #define	putbits(name, x) \
 	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
 		error = error2;
 	if (error == 0) {
 		int error2;
 
 		putbits(fd_in, 0);
 		putbits(fd_ou, 1);
 		putbits(fd_ex, 2);
 #undef putbits
 	}
 	if (selbits != &s_selbits[0])
 		free(selbits, M_SELECT);
 
 	return (error);
 }
 /* 
  * Convert a select bit set to poll flags.
  *
  * The backend always returns POLLHUP/POLLERR if appropriate and we
  * return this as a set bit in any set.
  */
 static int select_flags[3] = {
     POLLRDNORM | POLLHUP | POLLERR,
     POLLWRNORM | POLLHUP | POLLERR,
     POLLRDBAND | POLLERR
 };
 
 /*
  * Compute the fo_poll flags required for a fd given by the index and
  * bit position in the fd_mask array.
  */
 static __inline int
 selflags(fd_mask **ibits, int idx, fd_mask bit)
 {
 	int flags;
 	int msk;
 
 	flags = 0;
 	for (msk = 0; msk < 3; msk++) {
 		if (ibits[msk] == NULL)
 			continue;
 		if ((ibits[msk][idx] & bit) == 0)
 			continue;
 		flags |= select_flags[msk];
 	}
 	return (flags);
 }
 
 /*
  * Set the appropriate output bits given a mask of fired events and the
  * input bits originally requested.
  */
 static __inline int
 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
 {
 	int msk;
 	int n;
 
 	n = 0;
 	for (msk = 0; msk < 3; msk++) {
 		if ((events & select_flags[msk]) == 0)
 			continue;
 		if (ibits[msk] == NULL)
 			continue;
 		if ((ibits[msk][idx] & bit) == 0)
 			continue;
 		/*
 		 * XXX Check for a duplicate set.  This can occur because a
 		 * socket calls selrecord() twice for each poll() call
 		 * resulting in two selfds per real fd.  selrescan() will
 		 * call selsetbits twice as a result.
 		 */
 		if ((obits[msk][idx] & bit) != 0)
 			continue;
 		obits[msk][idx] |= bit;
 		n++;
 	}
 
 	return (n);
 }
 
 static __inline int
 getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
 {
 	cap_rights_t rights;
 
 	cap_rights_init(&rights, CAP_EVENT);
 
 	return (fget_unlocked(fdp, fd, &rights, fpp, NULL));
 }
 
 /*
  * Traverse the list of fds attached to this thread's seltd and check for
  * completion.
  */
 static int
 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
 {
 	struct filedesc *fdp;
 	struct selinfo *si;
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct file *fp;
 	fd_mask bit;
 	int fd, ev, n, idx;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
 	n = 0;
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		fd = (int)(uintptr_t)sfp->sf_cookie;
 		si = sfp->sf_si;
 		selfdfree(stp, sfp);
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
 		error = getselfd_cap(fdp, fd, &fp);
 		if (error)
 			return (error);
 		idx = fd / NFDBITS;
 		bit = (fd_mask)1 << (fd % NFDBITS);
 		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
 		fdrop(fp, td);
 		if (ev != 0)
 			n += selsetbits(ibits, obits, idx, bit, ev);
 	}
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * Perform the initial filedescriptor scan and register ourselves with
  * each selinfo.
  */
 static int
 selscan(td, ibits, obits, nfd)
 	struct thread *td;
 	fd_mask **ibits, **obits;
 	int nfd;
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	fd_mask bit;
 	int ev, flags, end, fd;
 	int n, idx;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	n = 0;
 	for (idx = 0, fd = 0; fd < nfd; idx++) {
 		end = imin(fd + NFDBITS, nfd);
 		for (bit = 1; fd < end; bit <<= 1, fd++) {
 			/* Compute the list of events we're interested in. */
 			flags = selflags(ibits, idx, bit);
 			if (flags == 0)
 				continue;
 			error = getselfd_cap(fdp, fd, &fp);
 			if (error)
 				return (error);
 			selfdalloc(td, (void *)(uintptr_t)fd);
 			ev = fo_poll(fp, flags, td->td_ucred, td);
 			fdrop(fp, td);
 			if (ev != 0)
 				n += selsetbits(ibits, obits, idx, bit, ev);
 		}
 	}
 
 	td->td_retval[0] = n;
 	return (0);
 }
 
 int
 sys_poll(struct thread *td, struct poll_args *uap)
 {
 	struct timespec ts, *tsp;
 
 	if (uap->timeout != INFTIM) {
 		if (uap->timeout < 0)
 			return (EINVAL);
 		ts.tv_sec = uap->timeout / 1000;
 		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
 }
 
 int
 kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
     struct timespec *tsp, sigset_t *uset)
 {
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
 	sbintime_t sbt, precision, tmp;
 	time_t over;
 	struct timespec ts;
 	int error;
 	size_t ni;
 
 	precision = 0;
 	if (tsp != NULL) {
 		if (tsp->tv_sec < 0)
 			return (EINVAL);
 		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
 			return (EINVAL);
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			sbt = 0;
 		else {
 			ts = *tsp;
 			if (ts.tv_sec > INT32_MAX / 2) {
 				over = ts.tv_sec - INT32_MAX / 2;
 				ts.tv_sec -= over;
 			} else
 				over = 0;
 			tmp = tstosbt(ts);
 			precision = tmp;
 			precision >>= tc_precexp;
 			if (TIMESEL(&sbt, tmp))
 				sbt += tc_tick_sbt;
 			sbt += tmp;
 		}
 	} else
 		sbt = -1;
 
 	if (nfds > maxfilesperproc && nfds > FD_SETSIZE) 
 		return (EINVAL);
 	ni = nfds * sizeof(struct pollfd);
 	if (ni > sizeof(smallbits))
 		bits = malloc(ni, M_TEMP, M_WAITOK);
 	else
 		bits = smallbits;
 	error = copyin(fds, bits, ni);
 	if (error)
 		goto done;
 
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &td->td_oldsigmask, 0);
 		if (error)
 			goto done;
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = pollscan(td, bits, nfds);
 		if (error || td->td_retval[0] != 0)
 			break;
 		error = seltdwait(td, sbt, precision);
 		if (error)
 			break;
 		error = pollrescan(td);
 		if (error || td->td_retval[0] != 0)
 			break;
 	}
 	seltdclear(td);
 
 done:
 	/* poll is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 	if (error == 0) {
 		error = pollout(td, bits, fds, nfds);
 		if (error)
 			goto out;
 	}
 out:
 	if (ni > sizeof(smallbits))
 		free(bits, M_TEMP);
 	return (error);
 }
 
 int
 sys_ppoll(struct thread *td, struct ppoll_args *uap)
 {
 	struct timespec ts, *tsp;
 	sigset_t set, *ssp;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 	if (uap->set != NULL) {
 		error = copyin(uap->set, &set, sizeof(set));
 		if (error)
 			return (error);
 		ssp = &set;
 	} else
 		ssp = NULL;
 	/*
 	 * fds is still a pointer to user space. kern_poll() will
 	 * take care of copyin that array to the kernel space.
 	 */
 
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
 }
 
 static int
 pollrescan(struct thread *td)
 {
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct selinfo *si;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct pollfd *fd;
 #ifdef CAPABILITIES
 	cap_rights_t rights;
 #endif
 	int n;
 
 	n = 0;
 	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
 	FILEDESC_SLOCK(fdp);
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		fd = (struct pollfd *)sfp->sf_cookie;
 		si = sfp->sf_si;
 		selfdfree(stp, sfp);
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
 		fp = fdp->fd_ofiles[fd->fd].fde_file;
 #ifdef CAPABILITIES
 		if (fp == NULL ||
 		    cap_check(cap_rights(fdp, fd->fd),
 		    cap_rights_init(&rights, CAP_EVENT)) != 0)
 #else
 		if (fp == NULL)
 #endif
 		{
 			fd->revents = POLLNVAL;
 			n++;
 			continue;
 		}
 
 		/*
 		 * Note: backend also returns POLLHUP and
 		 * POLLERR if appropriate.
 		 */
 		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
 		if (fd->revents != 0)
 			n++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
 }
 
 
 static int
 pollout(td, fds, ufds, nfd)
 	struct thread *td;
 	struct pollfd *fds;
 	struct pollfd *ufds;
 	u_int nfd;
 {
 	int error = 0;
 	u_int i = 0;
 	u_int n = 0;
 
 	for (i = 0; i < nfd; i++) {
 		error = copyout(&fds->revents, &ufds->revents,
 		    sizeof(ufds->revents));
 		if (error)
 			return (error);
 		if (fds->revents != 0)
 			n++;
 		fds++;
 		ufds++;
 	}
 	td->td_retval[0] = n;
 	return (0);
 }
 
 static int
 pollscan(td, fds, nfd)
 	struct thread *td;
 	struct pollfd *fds;
 	u_int nfd;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp;
 #ifdef CAPABILITIES
 	cap_rights_t rights;
 #endif
 	int i, n = 0;
 
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < nfd; i++, fds++) {
 		if (fds->fd > fdp->fd_lastfile) {
 			fds->revents = POLLNVAL;
 			n++;
 		} else if (fds->fd < 0) {
 			fds->revents = 0;
 		} else {
 			fp = fdp->fd_ofiles[fds->fd].fde_file;
 #ifdef CAPABILITIES
 			if (fp == NULL ||
 			    cap_check(cap_rights(fdp, fds->fd),
 			    cap_rights_init(&rights, CAP_EVENT)) != 0)
 #else
 			if (fp == NULL)
 #endif
 			{
 				fds->revents = POLLNVAL;
 				n++;
 			} else {
 				/*
 				 * Note: backend also returns POLLHUP and
 				 * POLLERR if appropriate.
 				 */
 				selfdalloc(td, fds);
 				fds->revents = fo_poll(fp, fds->events,
 				    td->td_ucred, td);
 				/*
 				 * POSIX requires POLLOUT to be never
 				 * set simultaneously with POLLHUP.
 				 */
 				if ((fds->revents & POLLHUP) != 0)
 					fds->revents &= ~POLLOUT;
 
 				if (fds->revents != 0)
 					n++;
 			}
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * OpenBSD poll system call.
  *
  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct openbsd_poll_args {
 	struct pollfd *fds;
 	u_int	nfds;
 	int	timeout;
 };
 #endif
 int
 sys_openbsd_poll(td, uap)
 	register struct thread *td;
 	register struct openbsd_poll_args *uap;
 {
 	return (sys_poll(td, (struct poll_args *)uap));
 }
 
 /*
  * XXX This was created specifically to support netncp and netsmb.  This
  * allows the caller to specify a socket to wait for events on.  It returns
  * 0 if any events matched and an error otherwise.  There is no way to
  * determine which events fired.
  */
 int
 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
 {
 	struct timeval rtv;
 	sbintime_t asbt, precision, rsbt;
 	int error;
 
 	precision = 0;	/* stupid gcc! */
 	if (tvp != NULL) {
 		rtv = *tvp;
 		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 
 		    rtv.tv_usec >= 1000000)
 			return (EINVAL);
 		if (!timevalisset(&rtv))
 			asbt = 0;
 		else if (rtv.tv_sec <= INT32_MAX) {
 			rsbt = tvtosbt(rtv);
 			precision = rsbt;
 			precision >>= tc_precexp;
 			if (TIMESEL(&asbt, rsbt))
 				asbt += tc_tick_sbt;
 			if (asbt <= SBT_MAX - rsbt)
 				asbt += rsbt;
 			else
 				asbt = -1;
 		} else
 			asbt = -1;
 	} else
 		asbt = -1;
 	seltdinit(td);
 	/*
 	 * Iterate until the timeout expires or the socket becomes ready.
 	 */
 	for (;;) {
 		selfdalloc(td, NULL);
 		error = sopoll(so, events, NULL, td);
 		/* error here is actually the ready events. */
 		if (error)
 			return (0);
 		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 	}
 	seltdclear(td);
 	/* XXX Duplicates ncp/smb behavior. */
 	if (error == ERESTART)
 		error = 0;
 	return (error);
 }
 
 /*
  * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
  * have two select sets, one for read and another for write.
  */
 static void
 selfdalloc(struct thread *td, void *cookie)
 {
 	struct seltd *stp;
 
 	stp = td->td_sel;
 	if (stp->st_free1 == NULL)
 		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
 	stp->st_free1->sf_td = stp;
 	stp->st_free1->sf_cookie = cookie;
 	if (stp->st_free2 == NULL)
 		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
 	stp->st_free2->sf_td = stp;
 	stp->st_free2->sf_cookie = cookie;
 }
 
 static void
 selfdfree(struct seltd *stp, struct selfd *sfp)
 {
 	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
 	if (sfp->sf_si != NULL) {
 		mtx_lock(sfp->sf_mtx);
 		if (sfp->sf_si != NULL) {
 			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
 			refcount_release(&sfp->sf_refs);
 		}
 		mtx_unlock(sfp->sf_mtx);
 	}
 	if (refcount_release(&sfp->sf_refs))
 		uma_zfree(selfd_zone, sfp);
 }
 
 /* Drain the waiters tied to all the selfd belonging the specified selinfo. */
 void
 seldrain(sip)
         struct selinfo *sip;
 {
 
 	/*
 	 * This feature is already provided by doselwakeup(), thus it is
 	 * enough to go for it.
 	 * Eventually, the context, should take care to avoid races
 	 * between thread calling select()/poll() and file descriptor
 	 * detaching, but, again, the races are just the same as
 	 * selwakeup().
 	 */
         doselwakeup(sip, -1);
 }
 
 /*
  * Record a select request.
  */
 void
 selrecord(selector, sip)
 	struct thread *selector;
 	struct selinfo *sip;
 {
 	struct selfd *sfp;
 	struct seltd *stp;
 	struct mtx *mtxp;
 
 	stp = selector->td_sel;
 	/*
 	 * Don't record when doing a rescan.
 	 */
 	if (stp->st_flags & SELTD_RESCAN)
 		return;
 	/*
 	 * Grab one of the preallocated descriptors.
 	 */
 	sfp = NULL;
 	if ((sfp = stp->st_free1) != NULL)
 		stp->st_free1 = NULL;
 	else if ((sfp = stp->st_free2) != NULL)
 		stp->st_free2 = NULL;
 	else
 		panic("selrecord: No free selfd on selq");
 	mtxp = sip->si_mtx;
 	if (mtxp == NULL)
 		mtxp = mtx_pool_find(mtxpool_select, sip);
 	/*
 	 * Initialize the sfp and queue it in the thread.
 	 */
 	sfp->sf_si = sip;
 	sfp->sf_mtx = mtxp;
 	refcount_init(&sfp->sf_refs, 2);
 	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
 	/*
 	 * Now that we've locked the sip, check for initialization.
 	 */
 	mtx_lock(mtxp);
 	if (sip->si_mtx == NULL) {
 		sip->si_mtx = mtxp;
 		TAILQ_INIT(&sip->si_tdlist);
 	}
 	/*
 	 * Add this thread to the list of selfds listening on this selinfo.
 	 */
 	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
 	mtx_unlock(sip->si_mtx);
 }
 
 /* Wake up a selecting thread. */
 void
 selwakeup(sip)
 	struct selinfo *sip;
 {
 	doselwakeup(sip, -1);
 }
 
 /* Wake up a selecting thread, and set its priority. */
 void
 selwakeuppri(sip, pri)
 	struct selinfo *sip;
 	int pri;
 {
 	doselwakeup(sip, pri);
 }
 
 /*
  * Do a wakeup when a selectable event occurs.
  */
 static void
 doselwakeup(sip, pri)
 	struct selinfo *sip;
 	int pri;
 {
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct seltd *stp;
 
 	/* If it's not initialized there can't be any waiters. */
 	if (sip->si_mtx == NULL)
 		return;
 	/*
 	 * Locking the selinfo locks all selfds associated with it.
 	 */
 	mtx_lock(sip->si_mtx);
 	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
 		/*
 		 * Once we remove this sfp from the list and clear the
 		 * sf_si seltdclear will know to ignore this si.
 		 */
 		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
 		sfp->sf_si = NULL;
 		stp = sfp->sf_td;
 		mtx_lock(&stp->st_mtx);
 		stp->st_flags |= SELTD_PENDING;
 		cv_broadcastpri(&stp->st_wait, pri);
 		mtx_unlock(&stp->st_mtx);
 		if (refcount_release(&sfp->sf_refs))
 			uma_zfree(selfd_zone, sfp);
 	}
 	mtx_unlock(sip->si_mtx);
 }
 
 static void
 seltdinit(struct thread *td)
 {
 	struct seltd *stp;
 
 	if ((stp = td->td_sel) != NULL)
 		goto out;
 	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
 	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
 	cv_init(&stp->st_wait, "select");
 out:
 	stp->st_flags = 0;
 	STAILQ_INIT(&stp->st_selq);
 }
 
 static int
 seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
 {
 	struct seltd *stp;
 	int error;
 
 	stp = td->td_sel;
 	/*
 	 * An event of interest may occur while we do not hold the seltd
 	 * locked so check the pending flag before we sleep.
 	 */
 	mtx_lock(&stp->st_mtx);
 	/*
 	 * Any further calls to selrecord will be a rescan.
 	 */
 	stp->st_flags |= SELTD_RESCAN;
 	if (stp->st_flags & SELTD_PENDING) {
 		mtx_unlock(&stp->st_mtx);
 		return (0);
 	}
 	if (sbt == 0)
 		error = EWOULDBLOCK;
 	else if (sbt != -1)
 		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
 		    sbt, precision, C_ABSOLUTE);
 	else
 		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
 	mtx_unlock(&stp->st_mtx);
 
 	return (error);
 }
 
 void
 seltdfini(struct thread *td)
 {
 	struct seltd *stp;
 
 	stp = td->td_sel;
 	if (stp == NULL)
 		return;
 	if (stp->st_free1)
 		uma_zfree(selfd_zone, stp->st_free1);
 	if (stp->st_free2)
 		uma_zfree(selfd_zone, stp->st_free2);
 	td->td_sel = NULL;
 	free(stp, M_SELECT);
 }
 
 /*
  * Remove the references to the thread from all of the objects we were
  * polling.
  */
 static void
 seltdclear(struct thread *td)
 {
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 
 	stp = td->td_sel;
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
 		selfdfree(stp, sfp);
 	stp->st_flags = 0;
 }
 
 static void selectinit(void *);
 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
 static void
 selectinit(void *dummy __unused)
 {
 
 	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
 }
 
 /*
  * Set up a syscall return value that follows the convention specified for
  * posix_* functions.
  */
 int
 kern_posix_error(struct thread *td, int error)
 {
 
 	if (error <= 0)
 		return (error);
 	td->td_errno = error;
 	td->td_pflags |= TDP_NERRNO;
 	td->td_retval[0] = error;
 	return (0);
 }
Index: stable/11/sys/kern/vfs_aio.c
===================================================================
--- stable/11/sys/kern/vfs_aio.c	(revision 314333)
+++ stable/11/sys/kern/vfs_aio.c	(revision 314334)
@@ -1,2992 +1,2990 @@
 /*-
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 #include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 FEATURE(aio, "Asynchronous I/O");
 
 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
     "Async IO management");
 
 static int enable_aio_unsafe = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
     "Permit asynchronous IO on all file types, not just known-safe types");
 
 static unsigned int unsafe_warningcnt = 1;
 SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
     &unsafe_warningcnt, 0,
     "Warnings that will be triggered upon failed IO requests on unsafe files");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
     "Maximum number of kernel processes to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
     "Number of presently active kernel processes for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
     0,
     "Preferred number of ready kernel processes for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 /* Number of async I/O processes in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0,
     "Maximum active aio requests per process (stored in the process)");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process (stored in the process)");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process (stored in the process)");
 
 #ifdef COMPAT_FREEBSD6
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 #endif
 
 /*
  * Below is a key of locks used to protect each member of struct kaiocb
  * aioliojob and kaioinfo and any backends.
  *
  * * - need not protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * If the routine that services an AIO request blocks while running in an
  * AIO kernel process it can starve other I/O requests.  BIO requests
  * queued via aio_qphysio() complete in GEOM and do not use AIO kernel
  * processes at all.  Socket I/O requests use a separate pool of
  * kprocs and also force non-blocking I/O.  Other file I/O requests
  * use the generic fo_read/fo_write operations which can block.  The
  * fsync and mlock operations can also block while executing.  Ideally
  * none of these requests would block while executing.
  *
  * Note that the service routines cannot toggle O_NONBLOCK in the file
  * structure directly while handling a request due to races with
  * userland threads.
  */
 
 /* jobflags */
 #define	KAIOCB_QUEUEING		0x01
 #define	KAIOCB_CANCELLED	0x02
 #define	KAIOCB_CANCELLING	0x04
 #define	KAIOCB_CHECKSYNC	0x08
 #define	KAIOCB_CLEARED		0x10
 #define	KAIOCB_FINISHED		0x20
 
 /*
  * AIO process info
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aioproc {
 	int	aioprocflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
 	struct	proc *aioproc;			/* (*) the AIO proc */
 };
 
 /*
  * data-structure for lio signal management
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags */
 	int	lioj_count;			/* (a) listio flags */
 	int	lioj_finished_count;		/* (a) listio flags */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct	knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  */
 struct kaioinfo {
 	struct	mtx kaio_mtx;		/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_qallowed_count;	/* (*) maxiumu size of AIO queue */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
 	int	kaio_buffer_count;	/* (a) number of physio buffers */
 	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
 	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
 	struct	task kaio_task;		/* (*) task to kick aio processes */
 	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
 
 /*
  * Operations used to interact with userland aio control blocks.
  * Different ABIs provide their own operations.
  */
 struct aiocb_ops {
 	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
 	long	(*fetch_status)(struct aiocb *ujob);
 	long	(*fetch_error)(struct aiocb *ujob);
 	int	(*store_status)(struct aiocb *ujob, long status);
 	int	(*store_error)(struct aiocb *ujob, long error);
 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
 };
 
 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct kaiocb *job);
 static void	aio_process_rw(struct kaiocb *job);
 static void	aio_process_sync(struct kaiocb *job);
 static void	aio_process_mlock(struct kaiocb *job);
 static void	aio_schedule_fsync(void *context, int pending);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *ujob,
 		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
 static int	aio_queue_file(struct file *fp, struct kaiocb *job);
 static void	aio_physwakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p,
 		    struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct kaiocb *job);
 static void	aio_daemon(void *param);
 static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
 static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiop	async io process data
  *	aiocb	async io jobs
  *	aiol	list io job pointer - internal to aio_suspend XXX
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
 
 /* kqueue filters for aio */
 static struct filterops aio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_aioattach,
 	.f_detach = filt_aiodetach,
 	.f_event = filt_aio,
 };
 static struct filterops lio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_lioattach,
 	.f_detach = filt_liodetach,
 	.f_event = filt_lio
 };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_kick);
 
 /*
  * Main operations function for use as a kernel module.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		aio_onceonly();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization
  */
 static int
 aio_onceonly(void)
 {
 
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
 	ki->kaio_flags = 0;
 	ki->kaio_maxactive_count = max_aio_per_proc;
 	ki->kaio_active_count = 0;
 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
 	ki->kaio_count = 0;
 	ki->kaio_ballowed_count = max_buf_aio;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TAILQ_INIT(&ki->kaio_syncready);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
 		aio_newproc(NULL);
 }
 
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *td;
 	int error;
 
 	error = sigev_findtd(p, sigev, &td);
 	if (error)
 		return (error);
 	if (!KSI_ONQ(ksi)) {
 		ksiginfo_set_sigev(ksi, sigev);
 		ksi->ksi_code = SI_ASYNCIO;
 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		tdsendsignal(p, td, ksi->ksi_signo, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Free a job entry.  Wait for completion if it is currently active, but don't
  * delay forever.  If we delay, we return a flag that says that we have to
  * restart the queue scan.
  */
 static int
 aio_free_entry(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = job->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, job, plist);
 	TAILQ_REMOVE(&ki->kaio_all, job, allist);
 
 	lj = job->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* job is going away, we need to destroy any knotes */
 	knlist_delete(&job->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&job->ksi);
 	PROC_UNLOCK(p);
 
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * a kaiocb from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete().  This does mean that it is possible for the
 	 * thread pointer at close time to differ from the thread pointer
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
 	if (job->fd_file)
 		fdrop(job->fd_file, curthread);
 	crfree(job->cred);
 	uma_zfree(aiocb_zone, job);
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p,
     struct image_params *imgp __unused)
 {
    	aio_proc_rundown(arg, p);
 }
 
 static int
 aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
 {
 	aio_cancel_fn_t *func;
 	int cancelled;
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
 		return (0);
 	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
 	job->jobflags |= KAIOCB_CANCELLED;
 
 	func = job->cancel_fn;
 
 	/*
 	 * If there is no cancel routine, just leave the job marked as
 	 * cancelled.  The job should be in active use by a caller who
 	 * should complete it normally or when it fails to install a
 	 * cancel routine.
 	 */
 	if (func == NULL)
 		return (0);
 
 	/*
 	 * Set the CANCELLING flag so that aio_complete() will defer
 	 * completions of this job.  This prevents the job from being
 	 * freed out from under the cancel callback.  After the
 	 * callback any deferred completion (whether from the callback
 	 * or any other source) will be completed.
 	 */
 	job->jobflags |= KAIOCB_CANCELLING;
 	AIO_UNLOCK(ki);
 	func(job);
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_CANCELLING;
 	if (job->jobflags & KAIOCB_FINISHED) {
 		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(p, job);
 	} else {
 		/*
 		 * The cancel callback might have scheduled an
 		 * operation to cancel this request, but it is
 		 * only counted as cancelled if the request is
 		 * cancelled when the callback returns.
 		 */
 		cancelled = 0;
 	}
 	return (cancelled);
 }
 
 /*
  * Rundown the jobs for a given process.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kaiocb *job, *jobn;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		aio_cancel_job(p, ki, job);
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(job);
 
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
 	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  */
 static struct kaiocb *
 aio_selectjob(struct aioproc *aiop)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 restart:
 	TAILQ_FOREACH(job, &aio_jobs, list) {
 		userp = job->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
 			TAILQ_REMOVE(&aio_jobs, job, list);
 			if (!aio_clear_cancel_function(job))
 				goto restart;
 
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			break;
 		}
 	}
 	return (job);
 }
 
 /*
  * Move all data to a permanent storage device.  This code
  * simulates the fsync syscall.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 drop:
 	return (error);
 }
 
 /*
  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
  * does the I/O request for the non-physio version of the operations.  The
  * normal vn operations are used, and this code should work in all instances
  * for every type of file, including pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
 aio_process_rw(struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct aiocb *cb;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	ssize_t cnt;
 	long msgsnd_st, msgsnd_end;
 	long msgrcv_st, msgrcv_end;
 	long oublock_st, oublock_end;
 	long inblock_st, inblock_end;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
 	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = cb->aio_offset;
 	auio.uio_resid = cb->aio_nbytes;
 	cnt = cb->aio_nbytes;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
 	msgrcv_st = td->td_ru.ru_msgrcv;
 	msgsnd_st = td->td_ru.ru_msgsnd;
 	inblock_st = td->td_ru.ru_inblock;
 	oublock_st = td->td_ru.ru_oublock;
 
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
 		if (auio.uio_resid == 0)
 			error = 0;
 		else
 			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
 	msgrcv_end = td->td_ru.ru_msgrcv;
 	msgsnd_end = td->td_ru.ru_msgsnd;
 	inblock_end = td->td_ru.ru_inblock;
 	oublock_end = td->td_ru.ru_oublock;
 
 	job->msgrcv = msgrcv_end - msgrcv_st;
 	job->msgsnd = msgsnd_end - msgsnd_st;
 	job->inblock = inblock_end - inblock_st;
 	job->outblock = oublock_end - oublock_st;
 
 	if ((error) && (auio.uio_resid != cnt)) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	cnt -= auio.uio_resid;
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, cnt, 0);
 }
 
 static void
 aio_process_sync(struct kaiocb *job)
 {
 	struct thread *td = curthread;
 	struct ucred *td_savedcred = td->td_ucred;
 	struct file *fp = job->fd_file;
 	int error = 0;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	td->td_ucred = job->cred;
 	if (fp->f_vnode != NULL)
 		error = aio_fsync_vnode(td, fp->f_vnode);
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, 0, 0);
 }
 
 static void
 aio_process_mlock(struct kaiocb *job)
 {
 	struct aiocb *cb = &job->uaiocb;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
-	error = vm_mlock(job->userproc, job->cred,
-	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
-	if (error)
-		aio_complete(job, -1, error);
-	else
-		aio_complete(job, 0, 0);
+	error = kern_mlock(job->userproc, job->cred,
+	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
+	aio_complete(job, error != 0 ? -1 : 0, error);
 }
 
 static void
 aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *sjob, *sjobn;
 	int lj_done;
 	bool schedule_fsync;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = job->lio;
 	lj_done = 0;
 	if (lj) {
 		lj->lioj_finished_count++;
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
 
 	KNOTE_LOCKED(&job->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL
 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	if (job->jobflags & KAIOCB_CHECKSYNC) {
 		schedule_fsync = false;
 		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
 			if (job->fd_file != sjob->fd_file ||
 			    job->seqno >= sjob->seqno)
 				continue;
 			if (--sjob->pending > 0)
 				continue;
 			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
 			if (!aio_clear_cancel_function_locked(sjob))
 				continue;
 			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
 			schedule_fsync = true;
 		}
 		if (schedule_fsync)
 			taskqueue_enqueue(taskqueue_aiod_kick,
 			    &ki->kaio_sync_task);
 	}
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 static void
 aio_schedule_fsync(void *context, int pending)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 
 	ki = context;
 	AIO_LOCK(ki);
 	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
 		job = TAILQ_FIRST(&ki->kaio_syncready);
 		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		AIO_LOCK(ki);
 	}
 	AIO_UNLOCK(ki);
 }
 
 bool
 aio_cancel_cleared(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 
 	/*
 	 * The caller should hold the same queue lock held when
 	 * aio_clear_cancel_function() was called and set this flag
 	 * ensuring this check sees an up-to-date value.  However,
 	 * there is no way to assert that.
 	 */
 	ki = job->userproc->p_aioinfo;
 	return ((job->jobflags & KAIOCB_CLEARED) != 0);
 }
 
 static bool
 aio_clear_cancel_function_locked(struct kaiocb *job)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	MPASS(job->cancel_fn != NULL);
 	if (job->jobflags & KAIOCB_CANCELLING) {
 		job->jobflags |= KAIOCB_CLEARED;
 		return (false);
 	}
 	job->cancel_fn = NULL;
 	return (true);
 }
 
 bool
 aio_clear_cancel_function(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	bool ret;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	ret = aio_clear_cancel_function_locked(job);
 	AIO_UNLOCK(ki);
 	return (ret);
 }
 
 static bool
 aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	if (job->jobflags & KAIOCB_CANCELLED)
 		return (false);
 	job->cancel_fn = func;
 	return (true);
 }
 
 bool
 aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 	struct kaioinfo *ki;
 	bool ret;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	ret = aio_set_cancel_function_locked(job, func);
 	AIO_UNLOCK(ki);
 	return (ret);
 }
 
 void
 aio_complete(struct kaiocb *job, long status, int error)
 {
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	job->uaiocb._aiocb_private.error = error;
 	job->uaiocb._aiocb_private.status = status;
 
 	userp = job->userproc;
 	ki = userp->p_aioinfo;
 
 	AIO_LOCK(ki);
 	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
 	    ("duplicate aio_complete"));
 	job->jobflags |= KAIOCB_FINISHED;
 	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(userp, job);
 	}
 	AIO_UNLOCK(ki);
 }
 
 void
 aio_cancel(struct kaiocb *job)
 {
 
 	aio_complete(job, -1, ECANCELED);
 }
 
 void
 aio_switch_vmspace(struct kaiocb *job)
 {
 
 	vmspace_switch_aio(job->userproc->p_vmspace);
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process_*,
  * but the setup (and address space mgmt) is done in this routine.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct kaiocb *job;
 	struct aioproc *aiop;
 	struct kaioinfo *ki;
 	struct proc *p;
 	struct vmspace *myvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = td->td_proc;
 	myvm = vmspace_acquire_ref(p);
 
 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aioproc = p;
 	aiop->aioprocflags = 0;
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aioprocflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aioprocflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((job = aio_selectjob(aiop)) != NULL) {
 			mtx_unlock(&aio_job_mtx);
 
 			ki = job->userproc->p_aioinfo;
 			job->handle_fn(job);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&aio_job_mtx);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aioprocflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
 		    (aiop->aioprocflags & AIOP_FREE) &&
 		    num_aio_procs > target_aio_procs)
 			break;
 	}
 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
 	num_aio_procs--;
 	mtx_unlock(&aio_job_mtx);
 	uma_zfree(aiop_zone, aiop);
 	free_unr(aiod_unr, id);
 	vmspace_free(myvm);
 
 	KASSERT(p->p_vmspace == myvm,
 	    ("AIOD: bad vmspace for exiting daemon"));
 	KASSERT(myvm->vm_refcnt > 1,
 	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
 	kproc_exit(0);
 }
 
 /*
  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  */
 static int
 aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
 	int id;
 
 	id = alloc_unr(aiod_unr);
 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
 		RFNOWAIT, 0, "aiod%d", id);
 	if (error == 0) {
 		/*
 		 * Wait until daemon is started.
 		 */
 		sema_wait(&aio_newproc_sem);
 		mtx_lock(&aio_job_mtx);
 		num_aio_procs++;
 		if (start != NULL)
 			(*start)--;
 		mtx_unlock(&aio_job_mtx);
 	} else {
 		free_unr(aiod_unr, id);
 	}
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead physio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  */
 static int
 aio_qphysio(struct proc *p, struct kaiocb *job)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct bio *bp;
 	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	int error, ref, poff;
 	vm_prot_t prot;
 
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VCHR)
 		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
 
 	if ((csw->d_flags & D_DISK) == 0) {
 		error = -1;
 		goto unref;
 	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
 	ki = p->p_aioinfo;
 	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
 	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
 		if (cb->aio_nbytes > MAXPHYS) {
 			error = -1;
 			goto unref;
 		}
 
 		pbuf = NULL;
 	} else {
 		if (cb->aio_nbytes > MAXPHYS - poff) {
 			error = -1;
 			goto unref;
 		}
 		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
 			error = -1;
 			goto unref;
 		}
 
 		job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
 		BUF_KERNPROC(pbuf);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count++;
 		AIO_UNLOCK(ki);
 	}
 	job->bp = bp = g_alloc_bio();
 
 	bp->bio_length = cb->aio_nbytes;
 	bp->bio_bcount = cb->aio_nbytes;
 	bp->bio_done = aio_physwakeup;
 	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
 	bp->bio_offset = cb->aio_offset;
 	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
 	bp->bio_dev = dev;
 	bp->bio_caller1 = (void *)job;
 
 	prot = VM_PROT_READ;
 	if (cb->aio_lio_opcode == LIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
 	    nitems(job->pages));
 	if (job->npages < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
 	if (pbuf != NULL) {
 		pmap_qenter((vm_offset_t)pbuf->b_data,
 		    job->pages, job->npages);
 		bp->bio_data = pbuf->b_data + poff;
 		atomic_add_int(&num_buf_aio, 1);
 	} else {
 		bp->bio_ma = job->pages;
 		bp->bio_ma_n = job->npages;
 		bp->bio_ma_offset = poff;
 		bp->bio_data = unmapped_buf;
 		bp->bio_flags |= BIO_UNMAPPED;
 	}
 
 	/* Perform transfer. */
 	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	if (pbuf != NULL) {
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 		relpbuf(pbuf, NULL);
 		job->pbuf = NULL;
 	}
 	g_destroy_bio(bp);
 	job->bp = NULL;
 unref:
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD6
 static int
 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
 {
 
 	/*
 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 	 * supported by AIO with the old sigevent structure.
 	 */
 	nsig->sigev_notify = osig->sigev_notify;
 	switch (nsig->sigev_notify) {
 	case SIGEV_NONE:
 		break;
 	case SIGEV_SIGNAL:
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		break;
 	case SIGEV_KEVENT:
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb *ojob;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, kjob, sizeof(struct oaiocb));
 	if (error)
 		return (error);
 	ojob = (struct oaiocb *)kjob;
 	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 
 	return (copyin(ujob, kjob, sizeof(struct aiocb)));
 }
 
 static long
 aiocb_fetch_status(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.status));
 }
 
 static long
 aiocb_fetch_error(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.error));
 }
 
 static int
 aiocb_store_status(struct aiocb *ujob, long status)
 {
 
 	return (suword(&ujob->_aiocb_private.status, status));
 }
 
 static int
 aiocb_store_error(struct aiocb *ujob, long error)
 {
 
 	return (suword(&ujob->_aiocb_private.error, error));
 }
 
 static int
 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 
 	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
 }
 
 static int
 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 
 	return (suword(ujobp, (long)ujob));
 }
 
 static struct aiocb_ops aiocb_ops = {
 	.copyin = aiocb_copyin,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 static struct aiocb_ops aiocb_ops_osigevent = {
 	.copyin = aiocb_copyin_old_sigevent,
 	.fetch_status = aiocb_fetch_status,
 	.fetch_error = aiocb_fetch_error,
 	.store_status = aiocb_store_status,
 	.store_error = aiocb_store_error,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 #endif
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
  * technique is done in this code.
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
     int type, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	cap_rights_t rights;
 	struct file *fp;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 	u_short evflags;
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	ops->store_status(ujob, -1);
 	ops->store_error(ujob, 0);
 	ops->store_kernelinfo(ujob, -1);
 
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= ki->kaio_qallowed_count) {
 		ops->store_error(ujob, EAGAIN);
 		return (EAGAIN);
 	}
 
 	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	knlist_init_mtx(&job->klist, AIO_MTX(ki));
 
 	error = ops->copyin(ujob, &job->uaiocb);
 	if (error) {
 		ops->store_error(ujob, error);
 		uma_zfree(aiocb_zone, job);
 		return (error);
 	}
 
 	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		ops->store_error(ujob, EINVAL);
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
 		uma_zfree(aiocb_zone, job);
 		return (EINVAL);
 	}
 
 	ksiginfo_init(&job->ksi);
 
 	/* Save userspace address of the job info. */
 	job->ujob = ujob;
 
 	/* Get the opcode. */
 	if (type != LIO_NOP)
 		job->uaiocb.aio_lio_opcode = type;
 	opcode = job->uaiocb.aio_lio_opcode;
 
 	/*
 	 * Validate the opcode and fetch the file object for the specified
 	 * file descriptor.
 	 *
 	 * XXXRW: Moved the opcode validation up here so that we don't
 	 * retrieve a file descriptor without knowing what the capabiltity
 	 * should be.
 	 */
 	fd = job->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 		error = fget_write(td, fd,
 		    cap_rights_init(&rights, CAP_PWRITE), &fp);
 		break;
 	case LIO_READ:
 		error = fget_read(td, fd,
 		    cap_rights_init(&rights, CAP_PREAD), &fp);
 		break;
 	case LIO_SYNC:
 		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
 		break;
 	case LIO_MLOCK:
 		fp = NULL;
 		break;
 	case LIO_NOP:
 		error = fget(td, fd, cap_rights_init(&rights), &fp);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error) {
 		uma_zfree(aiocb_zone, job);
 		ops->store_error(ujob, error);
 		return (error);
 	}
 
 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if (opcode != LIO_SYNC && job->uaiocb.aio_offset == -1LL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	job->fd_file = fp;
 
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	job->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = ops->store_kernelinfo(ujob, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, job);
 		return (0);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	kev.ident = (uintptr_t)job->ujob;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
 	kev.data = (intptr_t)job;
 	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, 1);
 	if (error)
 		goto aqueue_fail;
 
 no_kqueue:
 
 	ops->store_error(ujob, EINPROGRESS);
 	job->uaiocb._aiocb_private.error = EINPROGRESS;
 	job->userproc = p;
 	job->cred = crhold(td->td_ucred);
 	job->jobflags = KAIOCB_QUEUEING;
 	job->lio = lj;
 
 	if (opcode == LIO_MLOCK) {
 		aio_schedule(job, aio_process_mlock);
 		error = 0;
 	} else if (fp->f_ops->fo_aio_queue == NULL)
 		error = aio_queue_file(fp, job);
 	else
 		error = fo_aio_queue(fp, job);
 	if (error)
 		goto aqueue_fail;
 
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_QUEUEING;
 	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	atomic_add_int(&num_queue_count, 1);
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/*
 		 * The queue callback completed the request synchronously.
 		 * The bulk of the completion is deferred in that case
 		 * until this point.
 		 */
 		aio_bio_done_notify(p, job);
 	} else
 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
 	AIO_UNLOCK(ki);
 	return (0);
 
 aqueue_fail:
 	knlist_delete(&job->klist, curthread, 0);
 	if (fp)
 		fdrop(fp, td);
 	uma_zfree(aiocb_zone, job);
 	ops->store_error(ujob, error);
 	return (error);
 }
 
 static void
 aio_cancel_daemon_job(struct kaiocb *job)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&aio_jobs, job, list);
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 void
 aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
 		mtx_unlock(&aio_job_mtx);
 		aio_cancel(job);
 		return;
 	}
 	job->handle_fn = func;
 	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
 	aio_kick_nowait(job->userproc);
 	mtx_unlock(&aio_job_mtx);
 }
 
 static void
 aio_cancel_sync(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
 	AIO_UNLOCK(ki);
 	aio_cancel(job);
 }
 
 int
 aio_queue_file(struct file *fp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *job2;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, opcode;
 	bool safe;
 
 	lj = job->lio;
 	ki = job->userproc->p_aioinfo;
 	opcode = job->uaiocb.aio_lio_opcode;
 	if (opcode == LIO_SYNC)
 		goto queueit;
 
 	if ((error = aio_qphysio(job->userproc, job)) == 0)
 		goto done;
 #if 0
 	/*
 	 * XXX: This means qphysio() failed with EFAULT.  The current
 	 * behavior is to retry the operation via fo_read/fo_write.
 	 * Wouldn't it be better to just complete the request with an
 	 * error here?
 	 */
 	if (error > 0)
 		goto done;
 #endif
 queueit:
 	safe = false;
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vp->v_type == VREG || vp->v_type == VDIR) {
 			mp = fp->f_vnode->v_mount;
 			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
 				safe = true;
 		}
 	}
 	if (!(safe || enable_aio_unsafe)) {
 		counted_warning(&unsafe_warningcnt,
 		    "is attempting to use unsafe AIO requests");
 		return (EOPNOTSUPP);
 	}
 
 	if (opcode == LIO_SYNC) {
 		AIO_LOCK(ki);
 		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
 			if (job2->fd_file == job->fd_file &&
 			    job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    job2->seqno < job->seqno) {
 				job2->jobflags |= KAIOCB_CHECKSYNC;
 				job->pending++;
 			}
 		}
 		if (job->pending != 0) {
 			if (!aio_set_cancel_function_locked(job,
 				aio_cancel_sync)) {
 				AIO_UNLOCK(ki);
 				aio_cancel(job);
 				return (0);
 			}
 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 		AIO_UNLOCK(ki);
 	}
 
 	switch (opcode) {
 	case LIO_READ:
 	case LIO_WRITE:
 		aio_schedule(job, aio_process_rw);
 		error = 0;
 		break;
 	case LIO_SYNC:
 		aio_schedule(job, aio_process_sync);
 		error = 0;
 		break;
 	default:
 		error = EINVAL;
 	}
 done:
 	return (error);
 }
 
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *aiop;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aioprocflags &= ~AIOP_FREE;
 		wakeup(aiop->aioproc);
 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
 	    ki->kaio_active_count + num_aio_resv_start <
 	    ki->kaio_maxactive_count) {
 		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
 	}
 }
 
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *aiop;
 	int error, ret = 0;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 retryproc:
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aioprocflags &= ~AIOP_FREE;
 		wakeup(aiop->aioproc);
 	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
 	    ki->kaio_active_count + num_aio_resv_start <
 	    ki->kaio_maxactive_count) {
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error) {
 			num_aio_resv_start--;
 			goto retryproc;
 		}
 	} else {
 		ret = -1;
 	}
 	return (ret);
 }
 
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp = context;
 
 	mtx_lock(&aio_job_mtx);
 	while (--pending >= 0) {
 		if (aio_kick(userp))
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  */
 static int
 kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	long status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 int
 sys_aio_return(struct thread *td, struct aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  */
 static int
 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
     struct timespec *ts)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *firstjob, *job;
 	int error, i, timo;
 
 	timo = 0;
 	if (ts) {
 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	if (njoblist == 0)
 		return (0);
 
 	AIO_LOCK(ki);
 	for (;;) {
 		firstjob = NULL;
 		error = 0;
 		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (job->ujob == ujoblist[i]) {
 					if (firstjob == NULL)
 						firstjob = job;
 					if (job->jobflags & KAIOCB_FINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/* All tasks were finished. */
 		if (firstjob == NULL)
 			break;
 
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	return (error);
 }
 
 int
 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	int error;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
 	if (error == 0)
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-physio aio operations not currently in
  * progress.
  */
 int
 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct kaiocb *job, *jobn;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 	int cancelled = 0;
 	int notcancelled = 0;
 	struct vnode *vp;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
 	if (error)
 		return (error);
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		goto done;
 
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vn_isdisk(vp, &error)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		if ((uap->fd == job->uaiocb.aio_fildes) &&
 		    ((uap->aiocbp == NULL) ||
 		     (uap->aiocbp == job->ujob))) {
 			if (aio_cancel_job(p, ki, job)) {
 				cancelled++;
 			} else {
 				notcancelled++;
 			}
 			if (uap->aiocbp != NULL)
 				break;
 		}
 	}
 	AIO_UNLOCK(ki);
 
 done:
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL) {
 		if (cancelled) {
 			td->td_retval[0] = AIO_CANCELED;
 			return (0);
 		}
 	}
 
 	if (notcancelled) {
 		td->td_retval[0] = AIO_NOTCANCELED;
 		return (0);
 	}
 
 	if (cancelled) {
 		td->td_retval[0] = AIO_CANCELED;
 		return (0);
 	}
 
 	td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
 
 /*
  * aio_error is implemented in the kernel level for compatibility purposes
  * only.  For a user mode async implementation, it would be best to do it in
  * a userland subroutine.
  */
 static int
 kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	int status;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 		if (job->ujob == ujob) {
 			if (job->jobflags & KAIOCB_FINISHED)
 				td->td_retval[0] =
 					job->uaiocb._aiocb_private.error;
 			else
 				td->td_retval[0] = EINPROGRESS;
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue.
 	 */
 	status = ops->fetch_status(ujob);
 	if (status == -1) {
 		td->td_retval[0] = ops->fetch_error(ujob);
 		return (0);
 	}
 
 	td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 int
 sys_aio_error(struct thread *td, struct aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
 }
 
 int
 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
 }
 
 static int
 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
     struct aiocb **acb_list, int nent, struct sigevent *sig,
     struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *job;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int error;
 	int nerror;
 	int i;
 
 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
 		return (EINVAL);
 
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (sig && (mode == LIO_NOWAIT)) {
 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/* Assume only new style KEVENT */
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uacb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 					uma_zfree(aiolio_zone, lj);
 					return EINVAL;
 				}
 				lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			uma_zfree(aiolio_zone, lj);
 			return EINVAL;
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nerror = 0;
 	for (i = 0; i < nent; i++) {
 		job = acb_list[i];
 		if (job != NULL) {
 			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
 			if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (mode == LIO_WAIT) {
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 			    == LIOJ_SIGNAL
 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal,
 					    &lj->lioj_ksi);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	if (nerror)
 		return (EIO);
 	return (error);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent osig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode,
 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 		    &aiocb_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 /* syscall - list directed I/O (REALTIME) */
 int
 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig, sizeof(sig));
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
 		    nent, sigp, &aiocb_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 static void
 aio_physwakeup(struct bio *bp)
 {
 	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	size_t nbytes;
 	int error, nblks;
 
 	/* Release mapping into kernel space. */
 	userp = job->userproc;
 	ki = userp->p_aioinfo;
 	if (job->pbuf) {
 		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
 		relpbuf(job->pbuf, NULL);
 		job->pbuf = NULL;
 		atomic_subtract_int(&num_buf_aio, 1);
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 	}
 	vm_page_unhold_pages(job->pages, job->npages);
 
 	bp = job->bp;
 	job->bp = NULL;
 	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
 	error = 0;
 	if (bp->bio_flags & BIO_ERROR)
 		error = bp->bio_error;
 	nblks = btodb(nbytes);
 	if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
 		job->outblock += nblks;
 	else
 		job->inblock += nblks;
 
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, nbytes, 0);
 
 	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */
 static int
 kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
     struct timespec *ts, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	struct aiocb *ujob;
 	long error, status;
 	int timo;
 
 	ops->store_aiocb(ujobp, NULL);
 
 	if (ts == NULL) {
 		timo = 0;
 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
 		timo = -1;
 	} else {
 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	job = NULL;
 	AIO_LOCK(ki);
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
 		if (timo == -1) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		ujob = job->ujob;
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_aiocb(ujobp, ujob);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else
 		AIO_UNLOCK(ki);
 
 	return (error);
 }
 
 int
 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
 }
 
 static int
 kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
     struct aiocb_ops *ops)
 {
 
 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
 		return (EINVAL);
 	return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
 }
 
 int
 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 
 	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct kaiocb *job = (struct kaiocb *)kn->kn_sdata;
 
 	/*
 	 * The job pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_ptr.p_aio = job;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&job->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct knlist *knl;
 
 	knl = &kn->kn_ptr.p_aio->klist;
 	knl->kl_lock(knl->kl_lockarg);
 	if (!knlist_empty(knl))
 		knlist_remove(knl, kn, 1);
 	knl->kl_unlock(knl->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct kaiocb *job = kn->kn_ptr.p_aio;
 
 	kn->kn_data = job->uaiocb._aiocb_private.error;
 	if (!(job->jobflags & KAIOCB_FINISHED))
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
 
 	/*
 	 * The aioliojob pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_ptr.p_lio = lj;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&lj->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct knlist *knl;
 
 	knl = &kn->kn_ptr.p_lio->klist;
 	knl->kl_lock(knl->kl_lockarg);
 	if (!knlist_empty(knl))
 		knlist_remove(knl, kn, 1);
 	knl->kl_unlock(knl->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob * lj = kn->kn_ptr.p_lio;
 
 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
 }
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/socket.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 struct __aiocb_private32 {
 	int32_t	status;
 	int32_t	error;
 	uint32_t kernelinfo;
 };
 
 #ifdef COMPAT_FREEBSD6
 typedef struct oaiocb32 {
 	int	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 } oaiocb32_t;
 #endif
 
 typedef struct aiocb32 {
 	int32_t	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	int	__spare__[2];
 	uint32_t __spare2__;
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
 } aiocb32_t;
 
 #ifdef COMPAT_FREEBSD6
 static int
 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
 {
 
 	/*
 	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
 	 * supported by AIO with the old sigevent structure.
 	 */
 	CP(*osig, *nsig, sigev_notify);
 	switch (nsig->sigev_notify) {
 	case SIGEV_NONE:
 		break;
 	case SIGEV_SIGNAL:
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		break;
 	case SIGEV_KEVENT:
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct oaiocb32 job32;
 	int error;
 
 	bzero(kjob, sizeof(struct aiocb));
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_old_sigevent32(&job32.aio_sigevent,
 	    &kjob->aio_sigevent));
 }
 #endif
 
 static int
 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct aiocb32 job32;
 	int error;
 
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 	CP(job32, *kjob, aio_fildes);
 	CP(job32, *kjob, aio_offset);
 	PTRIN_CP(job32, *kjob, aio_buf);
 	CP(job32, *kjob, aio_nbytes);
 	CP(job32, *kjob, aio_lio_opcode);
 	CP(job32, *kjob, aio_reqprio);
 	CP(job32, *kjob, _aiocb_private.status);
 	CP(job32, *kjob, _aiocb_private.error);
 	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
 	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
 }
 
 static long
 aiocb32_fetch_status(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.status));
 }
 
 static long
 aiocb32_fetch_error(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.error));
 }
 
 static int
 aiocb32_store_status(struct aiocb *ujob, long status)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.status, status));
 }
 
 static int
 aiocb32_store_error(struct aiocb *ujob, long error)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.error, error));
 }
 
 static int
 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
 }
 
 static int
 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 
 	return (suword32(ujobp, (long)ujob));
 }
 
 static struct aiocb_ops aiocb32_ops = {
 	.copyin = aiocb32_copyin,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 static struct aiocb_ops aiocb32_ops_osigevent = {
 	.copyin = aiocb32_copyin_old_sigevent,
 	.fetch_status = aiocb32_fetch_status,
 	.fetch_error = aiocb32_fetch_error,
 	.store_status = aiocb32_store_status,
 	.store_error = aiocb32_store_error,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 #endif
 
 int
 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
 }
 
 int
 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	uint32_t *ujoblist32;
 	int error, i;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	ujoblist32 = (uint32_t *)ujoblist;
 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
 	    sizeof(ujoblist32[0]));
 	if (error == 0) {
 		for (i = uap->nent; i > 0; i--)
 			ujoblist[i] = PTRIN(ujoblist32[i]);
 
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	}
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 int
 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_aio_read(struct thread *td,
     struct freebsd6_freebsd32_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb32_ops_osigevent));
 }
 #endif
 
 int
 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
 	    &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_aio_write(struct thread *td,
     struct freebsd6_freebsd32_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb32_ops_osigevent));
 }
 #endif
 
 int
 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_waitcomplete(struct thread *td,
     struct freebsd32_aio_waitcomplete_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
 	    &aiocb32_ops));
 }
 
 int
 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
 {
 
 	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
 	    &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_lio_listio(struct thread *td,
     struct freebsd6_freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent32 osig;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent32(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 int
 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct sigevent32 sig32;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig32, sizeof(sig32));
 		if (error)
 			return (error);
 		error = convert_sigevent32(&sig32, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 #endif
Index: stable/11/sys/sys/syscallsubr.h
===================================================================
--- stable/11/sys/sys/syscallsubr.h	(revision 314333)
+++ stable/11/sys/sys/syscallsubr.h	(revision 314334)
@@ -1,266 +1,275 @@
 /*-
  * Copyright (c) 2002 Ian Dowse.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_SYSCALLSUBR_H_
 #define _SYS_SYSCALLSUBR_H_
 
 #include <sys/signal.h>
 #include <sys/uio.h>
 #include <sys/socket.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 
 struct file;
 struct filecaps;
 enum idtype;
 struct itimerval;
 struct image_args;
 struct jail;
 struct kevent;
 struct kevent_copyops;
 struct kld_file_stat;
 struct ksiginfo;
 struct mbuf;
 struct msghdr;
 struct msqid_ds;
 struct pollfd;
 struct ogetdirentries_args;
 struct rlimit;
 struct rusage;
 union semun;
 struct sendfile_args;
 struct sockaddr;
 struct stat;
 struct thr_param;
 struct sched_param;
 struct __wrusage;
 
 int	kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg,
 	    u_int buflen, u_int path_max);
 int	kern_accept(struct thread *td, int s, struct sockaddr **name,
 	    socklen_t *namelen, struct file **fp);
 int	kern_accept4(struct thread *td, int s, struct sockaddr **name,
 	    socklen_t *namelen, int flags, struct file **fp);
 int	kern_accessat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int flags, int mode);
 int	kern_adjtime(struct thread *td, struct timeval *delta,
 	    struct timeval *olddelta);
 int	kern_alternate_path(struct thread *td, const char *prefix, const char *path,
 	    enum uio_seg pathseg, char **pathbuf, int create, int dirfd);
 int	kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa);
 int	kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds,
 	    size_t ncmds);
 int	kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights);
 int	kern_chdir(struct thread *td, char *path, enum uio_seg pathseg);
 int	kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
 	    clockid_t *clk_id);
 int	kern_clock_getres(struct thread *td, clockid_t clock_id,
 	    struct timespec *ts);
 int	kern_clock_gettime(struct thread *td, clockid_t clock_id,
 	    struct timespec *ats);
 int	kern_clock_settime(struct thread *td, clockid_t clock_id,
 	    struct timespec *ats);
 int	kern_close(struct thread *td, int fd);
 int	kern_connectat(struct thread *td, int dirfd, int fd,
 	    struct sockaddr *sa);
 int	kern_dup(struct thread *td, u_int mode, int flags, int old, int new);
 int	kern_execve(struct thread *td, struct image_args *args,
 	    struct mac *mac_p);
 int	kern_fchmodat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, mode_t mode, int flag);
 int	kern_fchownat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int uid, int gid, int flag);
 int	kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg);
 int	kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg);
 int	kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf);
 int	kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf);
 int	kern_fstat(struct thread *td, int fd, struct stat *sbp);
 int	kern_fstatfs(struct thread *td, int fd, struct statfs *buf);
 int	kern_fsync(struct thread *td, int fd, bool fullsync);
 int	kern_ftruncate(struct thread *td, int fd, off_t length);
 int	kern_futimes(struct thread *td, int fd, struct timeval *tptr,
 	    enum uio_seg tptrseg);
 int	kern_futimens(struct thread *td, int fd, struct timespec *tptr,
 	    enum uio_seg tptrseg);
 int	kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
 	    long *basep, ssize_t *residp, enum uio_seg bufseg);
 int	kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
 	    size_t *countp, enum uio_seg bufseg, int mode);
 int	kern_getitimer(struct thread *, u_int, struct itimerval *);
 int	kern_getppid(struct thread *);
 int	kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
 	    socklen_t *alen);
 int	kern_getrusage(struct thread *td, int who, struct rusage *rup);
 int	kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
 	    socklen_t *alen);
 int	kern_getsockopt(struct thread *td, int s, int level, int name,
 	    void *optval, enum uio_seg valseg, socklen_t *valsize);
 int	kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data);
 int	kern_jail(struct thread *td, struct jail *j);
 int	kern_jail_get(struct thread *td, struct uio *options, int flags);
 int	kern_jail_set(struct thread *td, struct uio *options, int flags);
 int	kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
 	    struct kevent_copyops *k_ops, const struct timespec *timeout);
 int	kern_kevent_anonymous(struct thread *td, int nevents,
 	    struct kevent_copyops *k_ops);
 int	kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
 	    int nevents, struct kevent_copyops *k_ops,
 	    const struct timespec *timeout);
 int	kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
 int	kern_kldload(struct thread *td, const char *file, int *fileid);
 int	kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
 int	kern_kldunload(struct thread *td, int fileid, int flags);
 int	kern_linkat(struct thread *td, int fd1, int fd2, char *path1,
 	    char *path2, enum uio_seg segflg, int follow);
 int	kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
 	    struct timeval *tptr, enum uio_seg tptrseg);
+int	kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav);
 int	kern_mkdirat(struct thread *td, int fd, char *path,
 	    enum uio_seg segflg, int mode);
 int	kern_mkfifoat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int mode);
 int	kern_mknodat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int mode, int dev);
+int	kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr,
+	    size_t len);
+int	kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot,
+	    int flags, int fd, off_t pos);
+int	kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
 int	kern_msgctl(struct thread *, int, int, struct msqid_ds *);
 int	kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
 int	kern_msgsnd(struct thread *, int, const void *, size_t, int, long);
+int	kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags);
+int	kern_munlock(struct thread *td, uintptr_t addr, size_t size);
+int	kern_munmap(struct thread *td, uintptr_t addr, size_t size);
 int     kern_nanosleep(struct thread *td, struct timespec *rqt,
 	    struct timespec *rmt);
 int	kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
 	    long *ploff);
 int	kern_openat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, int flags, int mode);
 int	kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg,
 	    int name, u_long flags);
 int	kern_pipe(struct thread *td, int fildes[2], int flags,
 	    struct filecaps *fcaps1, struct filecaps *fcaps2);
 int	kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
 	    struct timespec *tsp, sigset_t *uset);
 int	kern_posix_error(struct thread *td, int error);
 int	kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
 	    int advice);
 int	kern_posix_fallocate(struct thread *td, int fd, off_t offset,
 	    off_t len);
 int	kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com,
 	    void *data);
 int	kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset);
 int	kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
 	    fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);
 int	kern_ptrace(struct thread *td, int req, pid_t pid, void *addr,
 	    int data);
 int	kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset);
 int	kern_readlinkat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count);
 int	kern_readv(struct thread *td, int fd, struct uio *auio);
 int	kern_recvit(struct thread *td, int s, struct msghdr *mp,
 	    enum uio_seg fromseg, struct mbuf **controlp);
 int	kern_renameat(struct thread *td, int oldfd, char *old, int newfd,
 	    char *new, enum uio_seg pathseg);
 int	kern_rmdirat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg);
 int	kern_sched_getparam(struct thread *td, struct thread *targettd,
 	    struct sched_param *param);
 int	kern_sched_getscheduler(struct thread *td, struct thread *targettd,
 	    int *policy);
 int	kern_sched_setparam(struct thread *td, struct thread *targettd,
 	    struct sched_param *param);
 int	kern_sched_setscheduler(struct thread *td, struct thread *targettd,
 	    int policy, struct sched_param *param);
 int	kern_sched_rr_get_interval(struct thread *td, pid_t pid,
 	    struct timespec *ts);
 int	kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd,
 	    struct timespec *ts);
 int	kern_semctl(struct thread *td, int semid, int semnum, int cmd,
 	    union semun *arg, register_t *rval);
 int	kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
 	    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits);
 int	kern_sendfile(struct thread *td, struct sendfile_args *uap,
 	    struct uio *hdr_uio, struct uio *trl_uio, int compat);
 int	kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
 	    struct mbuf *control, enum uio_seg segflg);
 int	kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups);
 int	kern_setitimer(struct thread *, u_int, struct itimerval *,
 	    struct itimerval *);
 int	kern_setrlimit(struct thread *, u_int, struct rlimit *);
 int	kern_setsockopt(struct thread *td, int s, int level, int name,
 	    void *optval, enum uio_seg valseg, socklen_t valsize);
 int	kern_settimeofday(struct thread *td, struct timeval *tv,
 	    struct timezone *tzp);
 int	kern_shm_open(struct thread *td, const char *userpath, int flags,
 	    mode_t mode, struct filecaps *fcaps);
 int	kern_shmat(struct thread *td, int shmid, const void *shmaddr,
 	    int shmflg);
 int	kern_shmctl(struct thread *td, int shmid, int cmd, void *buf,
 	    size_t *bufsz);
 int	kern_sigaction(struct thread *td, int sig, const struct sigaction *act,
 	    struct sigaction *oact, int flags);
 int	kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss);
 int	kern_sigprocmask(struct thread *td, int how,
 	    sigset_t *set, sigset_t *oset, int flags);
 int	kern_sigsuspend(struct thread *td, sigset_t mask);
 int	kern_sigtimedwait(struct thread *td, sigset_t waitset,
 	    struct ksiginfo *ksi, struct timespec *timeout);
 int	kern_statat(struct thread *td, int flag, int fd, char *path,
 	    enum uio_seg pathseg, struct stat *sbp,
 	    void (*hook)(struct vnode *vp, struct stat *sbp));
 int	kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
 	    struct statfs *buf);
 int	kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
 	    enum uio_seg segflg);
 int	kern_ktimer_create(struct thread *td, clockid_t clock_id,
 	    struct sigevent *evp, int *timerid, int preset_id);
 int	kern_ktimer_delete(struct thread *, int);
 int	kern_ktimer_settime(struct thread *td, int timer_id, int flags,
 	    struct itimerspec *val, struct itimerspec *oval);
 int	kern_ktimer_gettime(struct thread *td, int timer_id,
 	    struct itimerspec *val);
 int	kern_ktimer_getoverrun(struct thread *td, int timer_id);
 int	kern_thr_alloc(struct proc *, int pages, struct thread **);
 int	kern_thr_exit(struct thread *td);
 int	kern_thr_new(struct thread *td, struct thr_param *param);
 int	kern_thr_suspend(struct thread *td, struct timespec *tsp);
 int	kern_truncate(struct thread *td, char *path, enum uio_seg pathseg,
 	    off_t length);
 int	kern_unlinkat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, ino_t oldinum);
 int	kern_utimesat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg);
 int	kern_utimensat(struct thread *td, int fd, char *path,
 	    enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
 	    int follow);
 int	kern_wait(struct thread *td, pid_t pid, int *status, int options,
 	    struct rusage *rup);
 int	kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status,
 	    int options, struct __wrusage *wrup, siginfo_t *sip);
 int	kern_writev(struct thread *td, int fd, struct uio *auio);
 int	kern_socketpair(struct thread *td, int domain, int type, int protocol,
 	    int *rsv);
 
 /* flags for kern_sigaction */
 #define	KSA_OSIGSET	0x0001	/* uses osigact_t */
 #define	KSA_FREEBSD4	0x0002	/* uses ucontext4 */
 
 #endif /* !_SYS_SYSCALLSUBR_H_ */
Index: stable/11/sys/vm/vm_extern.h
===================================================================
--- stable/11/sys/vm/vm_extern.h	(revision 314333)
+++ stable/11/sys/vm/vm_extern.h	(revision 314334)
@@ -1,119 +1,118 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
  * $FreeBSD$
  */
 
 #ifndef _VM_EXTERN_H_
 #define	_VM_EXTERN_H_
 
 struct pmap;
 struct proc;
 struct vmspace;
 struct vnode;
 struct vmem;
 
 #ifdef _KERNEL
 struct cdev;
 struct cdevsw;
 
 /* These operate on kernel virtual addresses only. */
 vm_offset_t kva_alloc(vm_size_t);
 void kva_free(vm_offset_t, vm_size_t);
 
 /* These operate on pageable virtual addresses. */
 vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t);
 void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
 
 /* These operate on virtual addresses backed by memory. */
 vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
 vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
     vm_memattr_t memattr);
 vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags);
 void kmem_free(struct vmem *, vm_offset_t, vm_size_t);
 
 /* This provides memory for previously allocated address space. */
 int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int);
 void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
 
 /* Bootstrapping. */
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
 void kmem_init(vm_offset_t, vm_offset_t);
 void kmem_init_zero_region(void);
 void kmeminit(void);
 
 void swapout_procs(int);
 int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
     vm_ooffset_t *);
 int vm_fault_disable_pagefaults(void);
 void vm_fault_enable_pagefaults(int save);
 int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
 int vm_forkproc(struct thread *, struct proc *, struct thread *,
     struct vmspace *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
     objtype_t, void *, vm_ooffset_t);
 int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
     vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
 int vm_mmap_to_errno(int rv);
 int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
 int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
     struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
 struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *);
 int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
 int vmspace_unshare(struct proc *);
 void vmspace_exit(struct thread *);
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vmspace_switch_aio(struct vmspace *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
 struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
-int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */
Index: stable/11/sys/vm/vm_mmap.c
===================================================================
--- stable/11/sys/vm/vm_mmap.c	(revision 314333)
+++ stable/11/sys/vm/vm_mmap.c	(revision 314334)
@@ -1,1549 +1,1561 @@
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
  *
  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
  */
 
 /*
  * Mapped file (mmap) interface to VM
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
+#include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vnode_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
 
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
 };
 #endif
 
 int
 sys_sbrk(struct thread *td, struct sbrk_args *uap)
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sstk_args {
 	int incr;
 };
 #endif
 
 int
 sys_sstk(struct thread *td, struct sstk_args *uap)
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct getpagesize_args {
 	int dummy;
 };
 #endif
 
 int
 ogetpagesize(struct thread *td, struct getpagesize_args *uap)
 {
 
 	td->td_retval[0] = PAGE_SIZE;
 	return (0);
 }
 #endif				/* COMPAT_43 */
 
 
 /*
  * Memory Map (mmap) system call.  Note that the file offset
  * and address are allowed to be NOT page aligned, though if
  * the MAP_FIXED flag it set, both must have the same remainder
  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  * page-aligned, the actual mapping starts at trunc_page(addr)
  * and the return value is adjusted up by the page offset.
  *
  * Generally speaking, only character devices which are themselves
  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  * there would be no cache coherency between a descriptor and a VM mapping
  * both to the same character device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mmap_args {
 	void *addr;
 	size_t len;
 	int prot;
 	int flags;
 	int fd;
 	long pad;
 	off_t pos;
 };
 #endif
 
 int
-sys_mmap(td, uap)
-	struct thread *td;
-	struct mmap_args *uap;
+sys_mmap(struct thread *td, struct mmap_args *uap)
 {
+
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
+}
+
+int
+kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags,
+    int fd, off_t pos)
+{
+	struct vmspace *vms;
 	struct file *fp;
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
+	vm_size_t pageoff;
 	vm_prot_t cap_maxprot;
-	int align, error, flags, prot;
-	off_t pos;
-	struct vmspace *vms = td->td_proc->p_vmspace;
+	int align, error;
 	cap_rights_t rights;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot;
-	flags = uap->flags;
-	pos = uap->pos;
-
+	vms = td->td_proc->p_vmspace;
 	fp = NULL;
+	AUDIT_ARG_FD(fd);
+	addr = addr0;
 
 	/*
 	 * Ignore old flags that used to be defined but did not do anything.
 	 */
 	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
 	
 	/*
 	 * Enforce the constraints.
 	 * Mapping of length 0 is only allowed for old binaries.
 	 * Anonymous mapping shall specify -1 as filedescriptor and
 	 * zero position for new code. Be nice to ancient a.out
 	 * binaries and correct pos for anonymous mapping, since old
 	 * ld.so sometimes issues anonymous map requests with non-zero
 	 * pos.
 	 */
 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
-		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
-		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
+		if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
+		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
 			return (EINVAL);
 	} else {
 		if ((flags & MAP_ANON) != 0)
 			pos = 0;
 	}
 
 	if (flags & MAP_STACK) {
-		if ((uap->fd != -1) ||
+		if ((fd != -1) ||
 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
 			return (EINVAL);
 		flags |= MAP_ANON;
 		pos = 0;
 	}
 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
 	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
 	    MAP_PREFAULT_READ |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
 	    MAP_ALIGNMENT_MASK)) != 0)
 		return (EINVAL);
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
 	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
 		return (EINVAL);
 	if (prot != PROT_NONE &&
 	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
 	 * and save its page offset component.
 	 */
 	pageoff = (pos & PAGE_MASK);
 	pos -= pageoff;
 
 	/* Adjust size for rounding (on both ends). */
 	size += pageoff;			/* low end... */
 	size = (vm_size_t) round_page(size);	/* hi end */
 
 	/* Ensure alignment is at least a page and fits in a pointer. */
 	align = flags & MAP_ALIGNMENT_MASK;
 	if (align != 0 && align != MAP_ALIGNED_SUPER &&
 	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
 	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
 		return (EINVAL);
 
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	if (flags & MAP_FIXED) {
 		/*
 		 * The specified address must have the same remainder
 		 * as the file offset taken modulo PAGE_SIZE, so it
 		 * should be aligned after adjustment by pageoff.
 		 */
 		addr -= pageoff;
 		if (addr & PAGE_MASK)
 			return (EINVAL);
 
 		/* Address range must be all in user VM space. */
 		if (addr < vm_map_min(&vms->vm_map) ||
 		    addr + size > vm_map_max(&vms->vm_map))
 			return (EINVAL);
 		if (addr + size < addr)
 			return (EINVAL);
 #ifdef MAP_32BIT
 		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
 			return (EINVAL);
 	} else if (flags & MAP_32BIT) {
 		/*
 		 * For MAP_32BIT, override the hint if it is too high and
 		 * do not bother moving the mapping past the heap (since
 		 * the heap is usually above 2GB).
 		 */
 		if (addr + size > MAP_32BIT_MAX_ADDR)
 			addr = 0;
 #endif
 	} else {
 		/*
 		 * XXX for non-fixed mappings where no hint is provided or
 		 * the hint would fall in the potential heap space,
 		 * place it after the end of the largest possible heap.
 		 *
 		 * There should really be a pmap call to determine a reasonable
 		 * location.
 		 */
 		if (addr == 0 ||
 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    addr < round_page((vm_offset_t)vms->vm_daddr +
 		    lim_max(td, RLIMIT_DATA))))
 			addr = round_page((vm_offset_t)vms->vm_daddr +
 			    lim_max(td, RLIMIT_DATA));
 	}
 	if (size == 0) {
 		/*
 		 * Return success without mapping anything for old
 		 * binaries that request a page-aligned mapping of
 		 * length 0.  For modern binaries, this function
 		 * returns an error earlier.
 		 */
 		error = 0;
 	} else if (flags & MAP_ANON) {
 		/*
 		 * Mapping blank space is trivial.
 		 *
 		 * This relies on VM_PROT_* matching PROT_*.
 		 */
 		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
 		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
 	} else {
 		/*
 		 * Mapping file, get fp for validation and don't let the
 		 * descriptor disappear on us if we block. Check capability
 		 * rights, but also return the maximum rights to be combined
 		 * with maxprot later.
 		 */
 		cap_rights_init(&rights, CAP_MMAP);
 		if (prot & PROT_READ)
 			cap_rights_set(&rights, CAP_MMAP_R);
 		if ((flags & MAP_SHARED) != 0) {
 			if (prot & PROT_WRITE)
 				cap_rights_set(&rights, CAP_MMAP_W);
 		}
 		if (prot & PROT_EXEC)
 			cap_rights_set(&rights, CAP_MMAP_X);
-		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
 		if (error != 0)
 			goto done;
 		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
 		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
 			error = EINVAL;
 			goto done;
 		}
 
 		/* This relies on VM_PROT_* matching PROT_*. */
 		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
 		    cap_maxprot, flags, pos, td);
 	}
 
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
 	if (fp)
 		fdrop(fp, td);
 
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 int
 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
 {
-	struct mmap_args oargs;
 
-	oargs.addr = uap->addr;
-	oargs.len = uap->len;
-	oargs.prot = uap->prot;
-	oargs.flags = uap->flags;
-	oargs.fd = uap->fd;
-	oargs.pos = uap->pos;
-	return (sys_mmap(td, &oargs));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
 }
 #endif
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct ommap_args {
 	caddr_t addr;
 	int len;
 	int prot;
 	int flags;
 	int fd;
 	long pos;
 };
 #endif
 int
-ommap(td, uap)
-	struct thread *td;
-	struct ommap_args *uap;
+ommap(struct thread *td, struct ommap_args *uap)
 {
-	struct mmap_args nargs;
 	static const char cvtbsdprot[8] = {
 		0,
 		PROT_EXEC,
 		PROT_WRITE,
 		PROT_EXEC | PROT_WRITE,
 		PROT_READ,
 		PROT_EXEC | PROT_READ,
 		PROT_WRITE | PROT_READ,
 		PROT_EXEC | PROT_WRITE | PROT_READ,
 	};
+	int flags, prot;
 
 #define	OMAP_ANON	0x0002
 #define	OMAP_COPY	0x0020
 #define	OMAP_SHARED	0x0010
 #define	OMAP_FIXED	0x0100
 
-	nargs.addr = uap->addr;
-	nargs.len = uap->len;
-	nargs.prot = cvtbsdprot[uap->prot & 0x7];
+	prot = cvtbsdprot[uap->prot & 0x7];
 #ifdef COMPAT_FREEBSD32
 #if defined(__amd64__)
 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
-	    nargs.prot != 0)
-		nargs.prot |= PROT_EXEC;
+	    prot != 0)
+		prot |= PROT_EXEC;
 #endif
 #endif
-	nargs.flags = 0;
+	flags = 0;
 	if (uap->flags & OMAP_ANON)
-		nargs.flags |= MAP_ANON;
+		flags |= MAP_ANON;
 	if (uap->flags & OMAP_COPY)
-		nargs.flags |= MAP_COPY;
+		flags |= MAP_COPY;
 	if (uap->flags & OMAP_SHARED)
-		nargs.flags |= MAP_SHARED;
+		flags |= MAP_SHARED;
 	else
-		nargs.flags |= MAP_PRIVATE;
+		flags |= MAP_PRIVATE;
 	if (uap->flags & OMAP_FIXED)
-		nargs.flags |= MAP_FIXED;
-	nargs.fd = uap->fd;
-	nargs.pos = uap->pos;
-	return (sys_mmap(td, &nargs));
+		flags |= MAP_FIXED;
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
+	    uap->fd, uap->pos));
 }
 #endif				/* COMPAT_43 */
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct msync_args {
 	void *addr;
 	size_t len;
 	int flags;
 };
 #endif
 int
-sys_msync(td, uap)
-	struct thread *td;
-	struct msync_args *uap;
+sys_msync(struct thread *td, struct msync_args *uap)
 {
+
+	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
+}
+
+int
+kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
+{
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	int flags;
+	vm_size_t pageoff;
 	vm_map_t map;
 	int rv;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	flags = uap->flags;
-
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
 		return (EINVAL);
 
 	map = &td->td_proc->p_vmspace->vm_map;
 
 	/*
 	 * Clean the pages and interpret the return value.
 	 */
 	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
 	    (flags & MS_INVALIDATE) != 0);
 	switch (rv) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_INVALID_ADDRESS:
 		return (ENOMEM);
 	case KERN_INVALID_ARGUMENT:
 		return (EBUSY);
 	case KERN_FAILURE:
 		return (EIO);
 	default:
 		return (EINVAL);
 	}
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munmap_args {
 	void *addr;
 	size_t len;
 };
 #endif
 int
-sys_munmap(td, uap)
-	struct thread *td;
-	struct munmap_args *uap;
+sys_munmap(struct thread *td, struct munmap_args *uap)
 {
+
+	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
+{
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_out pkm;
 	vm_map_entry_t entry;
 #endif
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
+	vm_size_t pageoff;
 	vm_map_t map;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
 	if (size == 0)
 		return (EINVAL);
 
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap...
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
 		return (EINVAL);
 	vm_map_lock(map);
 #ifdef HWPMC_HOOKS
 	/*
 	 * Inform hwpmc if the address range being unmapped contains
 	 * an executable region.
 	 */
 	pkm.pm_address = (uintptr_t) NULL;
 	if (vm_map_lookup_entry(map, addr, &entry)) {
 		for (;
 		     entry != &map->header && entry->start < addr + size;
 		     entry = entry->next) {
 			if (vm_map_check_protection(map, entry->start,
 				entry->end, VM_PROT_EXECUTE) == TRUE) {
 				pkm.pm_address = (uintptr_t) addr;
 				pkm.pm_size = (size_t) size;
 				break;
 			}
 		}
 	}
 #endif
 	vm_map_delete(map, addr, addr + size);
 
 #ifdef HWPMC_HOOKS
 	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
 	vm_map_lock_downgrade(map);
 	if (pkm.pm_address != (uintptr_t) NULL)
 		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
 	vm_map_unlock_read(map);
 #else
 	vm_map_unlock(map);
 #endif
 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mprotect_args {
 	const void *addr;
 	size_t len;
 	int prot;
 };
 #endif
 int
-sys_mprotect(td, uap)
-	struct thread *td;
-	struct mprotect_args *uap;
+sys_mprotect(struct thread *td, struct mprotect_args *uap)
 {
-	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	vm_prot_t prot;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot & VM_PROT_ALL;
+	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
+}
 
+int
+kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
+{
+	vm_offset_t addr;
+	vm_size_t pageoff;
+
+	addr = addr0;
+	prot = (prot & VM_PROT_ALL);
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
 	    addr + size, prot, FALSE)) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	case KERN_RESOURCE_SHORTAGE:
 		return (ENOMEM);
 	}
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct minherit_args {
 	void *addr;
 	size_t len;
 	int inherit;
 };
 #endif
 int
 sys_minherit(struct thread *td, struct minherit_args *uap)
 {
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
 	vm_inherit_t inherit;
 
 	addr = (vm_offset_t)uap->addr;
 	size = uap->len;
 	inherit = uap->inherit;
 
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
 	if (addr + size < addr)
 		return (EINVAL);
 
 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
 	    addr + size, inherit)) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	}
 	return (EINVAL);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct madvise_args {
 	void *addr;
 	size_t len;
 	int behav;
 };
 #endif
 
 int
 sys_madvise(struct thread *td, struct madvise_args *uap)
 {
-	vm_offset_t start, end;
+
+	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
+}
+
+int
+kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
+{
 	vm_map_t map;
+	vm_offset_t addr, end, start;
 	int flags;
 
 	/*
 	 * Check for our special case, advising the swap pager we are
 	 * "immortal."
 	 */
-	if (uap->behav == MADV_PROTECT) {
+	if (behav == MADV_PROTECT) {
 		flags = PPROT_SET;
 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
 		    PROC_SPROTECT, &flags));
 	}
 
 	/*
 	 * Check for illegal behavior
 	 */
-	if (uap->behav < 0 || uap->behav > MADV_CORE)
+	if (behav < 0 || behav > MADV_CORE)
 		return (EINVAL);
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
-	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
-	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
+	addr = addr0;
+	if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
 		return (EINVAL);
-	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
+	if ((addr + len) < addr)
 		return (EINVAL);
 
 	/*
 	 * Since this routine is only advisory, we default to conservative
 	 * behavior.
 	 */
-	start = trunc_page((vm_offset_t) uap->addr);
-	end = round_page((vm_offset_t) uap->addr + uap->len);
+	start = trunc_page(addr);
+	end = round_page(addr + len);
 
-	if (vm_map_madvise(map, start, end, uap->behav))
+	if (vm_map_madvise(map, start, end, behav))
 		return (EINVAL);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mincore_args {
 	const void *addr;
 	size_t len;
 	char *vec;
 };
 #endif
 
 int
 sys_mincore(struct thread *td, struct mincore_args *uap)
 {
 	vm_offset_t addr, first_addr;
 	vm_offset_t end, cend;
 	pmap_t pmap;
 	vm_map_t map;
 	char *vec;
 	int error = 0;
 	int vecindex, lastvecindex;
 	vm_map_entry_t current;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_paddr_t locked_pa;
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int mincoreinfo;
 	unsigned int timestamp;
 	boolean_t locked;
 
 	/*
 	 * Make sure that the addresses presented are valid for user
 	 * mode.
 	 */
 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
 	end = addr + (vm_size_t)round_page(uap->len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
 		return (ENOMEM);
 
 	/*
 	 * Address of byte vector
 	 */
 	vec = uap->vec;
 
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 
 	vm_map_lock_read(map);
 RestartScan:
 	timestamp = map->timestamp;
 
 	if (!vm_map_lookup_entry(map, addr, &entry)) {
 		vm_map_unlock_read(map);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Do this on a map entry basis so that if the pages are not
 	 * in the current processes address space, we can easily look
 	 * up the pages elsewhere.
 	 */
 	lastvecindex = -1;
 	for (current = entry;
 	    (current != &map->header) && (current->start < end);
 	    current = current->next) {
 
 		/*
 		 * check for contiguity
 		 */
 		if (current->end < end &&
 		    (entry->next == &map->header ||
 		     current->next->start > current->end)) {
 			vm_map_unlock_read(map);
 			return (ENOMEM);
 		}
 
 		/*
 		 * ignore submaps (for now) or null objects
 		 */
 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
 			current->object.vm_object == NULL)
 			continue;
 
 		/*
 		 * limit this scan to the current map entry and the
 		 * limits for the mincore call
 		 */
 		if (addr < current->start)
 			addr = current->start;
 		cend = current->end;
 		if (cend > end)
 			cend = end;
 
 		/*
 		 * scan this entry one page at a time
 		 */
 		while (addr < cend) {
 			/*
 			 * Check pmap first, it is likely faster, also
 			 * it can provide info as to whether we are the
 			 * one referencing or modifying the page.
 			 */
 			object = NULL;
 			locked_pa = 0;
 		retry:
 			m = NULL;
 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
 			if (locked_pa != 0) {
 				/*
 				 * The page is mapped by this process but not
 				 * both accessed and modified.  It is also
 				 * managed.  Acquire the object lock so that
 				 * other mappings might be examined.
 				 */
 				m = PHYS_TO_VM_PAGE(locked_pa);
 				if (m->object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = m->object;
 					locked = VM_OBJECT_TRYWLOCK(object);
 					vm_page_unlock(m);
 					if (!locked) {
 						VM_OBJECT_WLOCK(object);
 						vm_page_lock(m);
 						goto retry;
 					}
 				} else
 					vm_page_unlock(m);
 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
 				    ("mincore: page %p is mapped but invalid",
 				    m));
 			} else if (mincoreinfo == 0) {
 				/*
 				 * The page is not mapped by this process.  If
 				 * the object implements managed pages, then
 				 * determine if the page is resident so that
 				 * the mappings might be examined.
 				 */
 				if (current->object.vm_object != object) {
 					if (object != NULL)
 						VM_OBJECT_WUNLOCK(object);
 					object = current->object.vm_object;
 					VM_OBJECT_WLOCK(object);
 				}
 				if (object->type == OBJT_DEFAULT ||
 				    object->type == OBJT_SWAP ||
 				    object->type == OBJT_VNODE) {
 					pindex = OFF_TO_IDX(current->offset +
 					    (addr - current->start));
 					m = vm_page_lookup(object, pindex);
 					if (m == NULL &&
 					    vm_page_is_cached(object, pindex))
 						mincoreinfo = MINCORE_INCORE;
 					if (m != NULL && m->valid == 0)
 						m = NULL;
 					if (m != NULL)
 						mincoreinfo = MINCORE_INCORE;
 				}
 			}
 			if (m != NULL) {
 				/* Examine other mappings to the page. */
 				if (m->dirty == 0 && pmap_is_modified(m))
 					vm_page_dirty(m);
 				if (m->dirty != 0)
 					mincoreinfo |= MINCORE_MODIFIED_OTHER;
 				/*
 				 * The first test for PGA_REFERENCED is an
 				 * optimization.  The second test is
 				 * required because a concurrent pmap
 				 * operation could clear the last reference
 				 * and set PGA_REFERENCED before the call to
 				 * pmap_is_referenced(). 
 				 */
 				if ((m->aflags & PGA_REFERENCED) != 0 ||
 				    pmap_is_referenced(m) ||
 				    (m->aflags & PGA_REFERENCED) != 0)
 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
 			}
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * subyte may page fault.  In case it needs to modify
 			 * the map, we release the lock.
 			 */
 			vm_map_unlock_read(map);
 
 			/*
 			 * calculate index into user supplied byte vector
 			 */
 			vecindex = OFF_TO_IDX(addr - first_addr);
 
 			/*
 			 * If we have skipped map entries, we need to make sure that
 			 * the byte vector is zeroed for those skipped entries.
 			 */
 			while ((lastvecindex + 1) < vecindex) {
 				++lastvecindex;
 				error = subyte(vec + lastvecindex, 0);
 				if (error) {
 					error = EFAULT;
 					goto done2;
 				}
 			}
 
 			/*
 			 * Pass the page information to the user
 			 */
 			error = subyte(vec + vecindex, mincoreinfo);
 			if (error) {
 				error = EFAULT;
 				goto done2;
 			}
 
 			/*
 			 * If the map has changed, due to the subyte, the previous
 			 * output may be invalid.
 			 */
 			vm_map_lock_read(map);
 			if (timestamp != map->timestamp)
 				goto RestartScan;
 
 			lastvecindex = vecindex;
 			addr += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * subyte may page fault.  In case it needs to modify
 	 * the map, we release the lock.
 	 */
 	vm_map_unlock_read(map);
 
 	/*
 	 * Zero the last entries in the byte vector.
 	 */
 	vecindex = OFF_TO_IDX(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
 		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
 		if (error) {
 			error = EFAULT;
 			goto done2;
 		}
 	}
 
 	/*
 	 * If the map has changed, due to the subyte, the previous
 	 * output may be invalid.
 	 */
 	vm_map_lock_read(map);
 	if (timestamp != map->timestamp)
 		goto RestartScan;
 	vm_map_unlock_read(map);
 done2:
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mlock_args {
 	const void *addr;
 	size_t len;
 };
 #endif
 int
 sys_mlock(struct thread *td, struct mlock_args *uap)
 {
 
-	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+	return (kern_mlock(td->td_proc, td->td_ucred,
+	    __DECONST(uintptr_t, uap->addr), uap->len));
 }
 
 int
-vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t npages, size;
 	vm_map_t map;
 	unsigned long nsize;
 	int error;
 
 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)addr0;
+	addr = addr0;
 	size = len;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
 	if (last < addr || end < addr)
 		return (EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
 	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 		PROC_UNLOCK(proc);
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(proc);
 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(proc);
 		error = racct_set(proc, RACCT_MEMLOCK, nsize);
 		PROC_UNLOCK(proc);
 		if (error != 0)
 			return (ENOMEM);
 	}
 #endif
 	error = vm_map_wire(map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(proc);
 		racct_set(proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(proc);
 	}
 #endif
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mlockall_args {
 	int	how;
 };
 #endif
 
 int
 sys_mlockall(struct thread *td, struct mlockall_args *uap)
 {
 	vm_map_t map;
 	int error;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MLOCK);
 	if (error)
 		return (error);
 
 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 		return (EINVAL);
 
 	/*
 	 * If wiring all pages in the process would cause it to exceed
 	 * a hard resource limit, return ENOMEM.
 	 */
 	if (!old_mlock && uap->how & MCL_CURRENT) {
 		PROC_LOCK(td->td_proc);
 		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 		PROC_UNLOCK(td->td_proc);
 		if (error != 0)
 			return (ENOMEM);
 	}
 #endif
 
 	if (uap->how & MCL_FUTURE) {
 		vm_map_lock(map);
 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
 		vm_map_unlock(map);
 		error = 0;
 	}
 
 	if (uap->how & MCL_CURRENT) {
 		/*
 		 * P1003.1-2001 mandates that all currently mapped pages
 		 * will be memory resident and locked (wired) upon return
 		 * from mlockall(). vm_map_wire() will wire pages, by
 		 * calling vm_fault_wire() for each page in the region.
 		 */
 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 	}
 #ifdef RACCT
 	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munlockall_args {
 	register_t dummy;
 };
 #endif
 
 int
 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 {
 	vm_map_t map;
 	int error;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
 
 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
 	vm_map_lock(map);
 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
 	vm_map_unlock(map);
 
 	/* Forcibly unwire all pages. */
 	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 #ifdef RACCT
 	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct munlock_args {
 	const void *addr;
 	size_t len;
 };
 #endif
 int
-sys_munlock(td, uap)
-	struct thread *td;
-	struct munlock_args *uap;
+sys_munlock(struct thread *td, struct munlock_args *uap)
 {
+
+	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
+{
 	vm_offset_t addr, end, last, start;
-	vm_size_t size;
 #ifdef RACCT
 	vm_map_t map;
 #endif
 	int error;
 
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)uap->addr;
-	size = uap->len;
+	addr = addr0;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
 	if (last < addr || end < addr)
 		return (EINVAL);
 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
 	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		map = &td->td_proc->p_vmspace->vm_map;
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
 }
 
 /*
  * vm_mmap_vnode()
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
     boolean_t *writecounted)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
 	struct ucred *cred;
 	int error, flags, locktype;
 
 	cred = td->td_ucred;
 	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
 		locktype = LK_EXCLUSIVE;
 	else
 		locktype = LK_SHARED;
 	if ((error = vget(vp, locktype, td)) != 0)
 		return (error);
+	AUDIT_ARG_VNODE1(vp);
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
 	if (vp->v_type == VREG) {
 		/*
 		 * Get the proper underlying object
 		 */
 		if (obj == NULL) {
 			error = EINVAL;
 			goto done;
 		}
 		if (obj->type == OBJT_VNODE && obj->handle != vp) {
 			vput(vp);
 			vp = (struct vnode *)obj->handle;
 			/*
 			 * Bypass filesystems obey the mpsafety of the
 			 * underlying fs.  Tmpfs never bypasses.
 			 */
 			error = vget(vp, locktype, td);
 			if (error != 0)
 				return (error);
 		}
 		if (locktype == LK_EXCLUSIVE) {
 			*writecounted = TRUE;
 			vnode_pager_update_writecount(obj, 0, objsize);
 		}
 	} else {
 		error = EINVAL;
 		goto done;
 	}
 	if ((error = VOP_GETATTR(vp, &va, cred)))
 		goto done;
 #ifdef MAC
 	/* This relies on VM_PROT_* matching PROT_*. */
 	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 	if (error != 0)
 		goto done;
 #endif
 	if ((flags & MAP_SHARED) != 0) {
 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 			if (prot & VM_PROT_WRITE) {
 				error = EPERM;
 				goto done;
 			}
 			*maxprotp &= ~VM_PROT_WRITE;
 		}
 	}
 	/*
 	 * If it is a regular file without any references
 	 * we do not need to sync it.
 	 * Adjust object size to be the size of actual file.
 	 */
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
 	if (obj->type == OBJT_VNODE) {
 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 		    cred);
 		if (obj == NULL) {
 			error = ENOMEM;
 			goto done;
 		}
 	} else {
 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 		    ("wrong object type"));
 		VM_OBJECT_WLOCK(obj);
 		vm_object_reference_locked(obj);
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(obj, 0);
 #endif
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	*objp = obj;
 	*flagsp = flags;
 
 	vfs_mark_atime(vp, cred);
 
 done:
 	if (error != 0 && *writecounted) {
 		*writecounted = FALSE;
 		vnode_pager_update_writecount(obj, objsize, 0);
 	}
 	vput(vp);
 	return (error);
 }
 
 /*
  * vm_mmap_cdev()
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on cdevs.
  */
 int
 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
     vm_ooffset_t *foff, vm_object_t *objp)
 {
 	vm_object_t obj;
 	int error, flags;
 
 	flags = *flagsp;
 
 	if (dsw->d_flags & D_MMAP_ANON) {
 		*objp = NULL;
 		*foff = 0;
 		*maxprotp = VM_PROT_ALL;
 		*flagsp |= MAP_ANON;
 		return (0);
 	}
 	/*
 	 * cdevs do not provide private mappings of any kind.
 	 */
 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
 	if (flags & (MAP_PRIVATE|MAP_COPY))
 		return (EINVAL);
 	/*
 	 * Force device mappings to be shared.
 	 */
 	flags |= MAP_SHARED;
 #ifdef MAC_XXX
 	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
 	if (error != 0)
 		return (error);
 #endif
 	/*
 	 * First, try d_mmap_single().  If that is not implemented
 	 * (returns ENODEV), fall back to using the device pager.
 	 * Note that d_mmap_single() must return a reference to the
 	 * object (it needs to bump the reference count of the object
 	 * it returns somehow).
 	 *
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 	if (error != ENODEV)
 		return (error);
 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 	    td->td_ucred);
 	if (obj == NULL)
 		return (EINVAL);
 	*objp = obj;
 	*flagsp = flags;
 	return (0);
 }
 
 /*
  * vm_mmap()
  *
  * Internal version of mmap used by exec, sys5 shared memory, and
  * various device drivers.  Handle is either a vnode pointer, a
  * character device, or NULL for MAP_ANON.
  */
 int
 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	vm_prot_t maxprot, int flags,
 	objtype_t handle_type, void *handle,
 	vm_ooffset_t foff)
 {
 	vm_object_t object;
 	struct thread *td = curthread;
 	int error;
 	boolean_t writecounted;
 
 	if (size == 0)
 		return (EINVAL);
 
 	size = round_page(size);
 	object = NULL;
 	writecounted = FALSE;
 
 	/*
 	 * Lookup/allocate object.
 	 */
 	switch (handle_type) {
 	case OBJT_DEVICE: {
 		struct cdevsw *dsw;
 		struct cdev *cdev;
 		int ref;
 
 		cdev = handle;
 		dsw = dev_refthread(cdev, &ref);
 		if (dsw == NULL)
 			return (ENXIO);
 		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
 		    dsw, &foff, &object);
 		dev_relthread(cdev, ref);
 		break;
 	}
 	case OBJT_VNODE:
 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 		    handle, &foff, &object, &writecounted);
 		break;
 	case OBJT_DEFAULT:
 		if (handle == NULL) {
 			error = 0;
 			break;
 		}
 		/* FALLTHROUGH */
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, writecounted, td);
 	if (error != 0 && object != NULL) {
 		/*
 		 * If this mapping was accounted for in the vnode's
 		 * writecount, then undo that now.
 		 */
 		if (writecounted)
 			vnode_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 	return (error);
 }
 
 /*
  * Internal version of mmap that maps a specific VM object into an
  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
  */
 int
 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
     boolean_t writecounted, struct thread *td)
 {
 	boolean_t fitit;
 	int docow, error, findspace, rv;
 
 	if (map == &td->td_proc->p_vmspace->vm_map) {
 		PROC_LOCK(td->td_proc);
 		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
 			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
 				return (ENOMEM);
 			}
 			error = racct_set(td->td_proc, RACCT_MEMLOCK,
 			    ptoa(pmap_wired_count(map->pmap)) + size);
 			if (error != 0) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
 				return (error);
 			}
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
 	 * The mmap() system call already enforces this by subtracting
 	 * the page offset from the file offset, but checking here
 	 * catches errors in device drivers (e.g. d_single_mmap()
 	 * callbacks) and other internal mapping requests (such as in
 	 * exec).
 	 */
 	if (foff & PAGE_MASK)
 		return (EINVAL);
 
 	if ((flags & MAP_FIXED) == 0) {
 		fitit = TRUE;
 		*addr = round_page(*addr);
 	} else {
 		if (*addr != trunc_page(*addr))
 			return (EINVAL);
 		fitit = FALSE;
 	}
 
 	if (flags & MAP_ANON) {
 		if (object != NULL || foff != 0)
 			return (EINVAL);
 		docow = 0;
 	} else if (flags & MAP_PREFAULT_READ)
 		docow = MAP_PREFAULT;
 	else
 		docow = MAP_PREFAULT_PARTIAL;
 
 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 		docow |= MAP_COPY_ON_WRITE;
 	if (flags & MAP_NOSYNC)
 		docow |= MAP_DISABLE_SYNCER;
 	if (flags & MAP_NOCORE)
 		docow |= MAP_DISABLE_COREDUMP;
 	/* Shared memory is also shared with children. */
 	if (flags & MAP_SHARED)
 		docow |= MAP_INHERIT_SHARE;
 	if (writecounted)
 		docow |= MAP_VN_WRITECOUNT;
 	if (flags & MAP_STACK) {
 		if (object != NULL)
 			return (EINVAL);
 		docow |= MAP_STACK_GROWS_DOWN;
 	}
 	if ((flags & MAP_EXCL) != 0)
 		docow |= MAP_CHECK_EXCL;
 
 	if (fitit) {
 		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 			findspace = VMFS_SUPER_SPACE;
 		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 			findspace = VMFS_ALIGNED_SPACE(flags >>
 			    MAP_ALIGNMENT_SHIFT);
 		else
 			findspace = VMFS_OPTIMAL_SPACE;
 		rv = vm_map_find(map, object, foff, addr, size,
 #ifdef MAP_32BIT
 		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
 #endif
 		    0, findspace, prot, maxprot, docow);
 	} else {
 		rv = vm_map_fixed(map, object, foff, *addr, size,
 		    prot, maxprot, docow);
 	}
 
 	if (rv == KERN_SUCCESS) {
 		/*
 		 * If the process has requested that all future mappings
 		 * be wired, then heed this.
 		 */
 		if (map->flags & MAP_WIREFUTURE) {
 			vm_map_wire(map, *addr, *addr + size,
 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 		}
 	}
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * Translate a Mach VM return code to zero on success or the appropriate errno
  * on failure.
  */
 int
 vm_mmap_to_errno(int rv)
 {
 
 	switch (rv) {
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_INVALID_ADDRESS:
 	case KERN_NO_SPACE:
 		return (ENOMEM);
 	case KERN_PROTECTION_FAILURE:
 		return (EACCES);
 	default:
 		return (EINVAL);
 	}
 }
Index: stable/11
===================================================================
--- stable/11	(revision 314333)
+++ stable/11	(revision 314334)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r302514,302524,313352,313655,313696