Index: projects/ino64/sys/compat/freebsd32/freebsd32.h
===================================================================
--- projects/ino64/sys/compat/freebsd32/freebsd32.h	(revision 276110)
+++ projects/ino64/sys/compat/freebsd32/freebsd32.h	(revision 276111)
@@ -1,420 +1,421 @@
 /*-
  * Copyright (c) 2001 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _COMPAT_FREEBSD32_FREEBSD32_H_
 #define _COMPAT_FREEBSD32_FREEBSD32_H_
 
 #include <sys/procfs.h>
 #include <sys/socket.h>
 #include <sys/user.h>
 
 /*
  * Conversions between a native pointer and the 32-bit integer used to
  * carry an address in the *32 structures below.  The expansions are
  * fully parenthesized so that they behave as a single primary expression
  * wherever they are used (e.g. under sizeof or next to a postfix operator).
  */
 #define PTRIN(v)	((void *)(uintptr_t)(v))
 #define PTROUT(v)	((u_int32_t)(uintptr_t)(v))
 
 /* Copy member 'fld' from structure 'src' to structure 'dst'. */
 #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0)
 /* Copy member 'fld', widening a 32-bit address into a pointer. */
 #define PTRIN_CP(src,dst,fld) \
 	do { (dst).fld = PTRIN((src).fld); } while (0)
 /* Copy member 'fld', narrowing a pointer into a 32-bit address. */
 #define PTROUT_CP(src,dst,fld) \
 	do { (dst).fld = PTROUT((src).fld); } while (0)
 
 /*
  * Being a newer port, 32-bit FreeBSD/MIPS uses 64-bit time_t.
  */
 #ifdef __mips__
 typedef	int64_t	time32_t;
 #else
 typedef	int32_t	time32_t;
 #endif
 
 /*
  * Time value with explicitly sized members: seconds are a time32_t and
  * microseconds an int32_t.  Presumably the compat image of struct timeval.
  */
 struct timeval32 {
 	time32_t tv_sec;
 	int32_t tv_usec;
 };
 /* Copy the tv_sec and tv_usec members of (src).fld into (dst).fld. */
 #define TV_CP(src,dst,fld) do {			\
 	CP((src).fld,(dst).fld,tv_sec);		\
 	CP((src).fld,(dst).fld,tv_usec);	\
 } while (0)
 
 /*
  * Time value with explicitly sized members: seconds are a time32_t and
  * nanoseconds an int32_t.  Presumably the compat image of struct timespec.
  */
 struct timespec32 {
 	time32_t tv_sec;
 	int32_t tv_nsec;
 };
 /* Copy the tv_sec and tv_nsec members of (src).fld into (dst).fld. */
 #define TS_CP(src,dst,fld) do {			\
 	CP((src).fld,(dst).fld,tv_sec);		\
 	CP((src).fld,(dst).fld,tv_nsec);	\
 } while (0)
 
 /* Interval timer setting built from two timespec32 values. */
 struct itimerspec32 {
 	struct timespec32  it_interval;
 	struct timespec32  it_value;
 };
 /* Copy both timespec members (it_interval, it_value) from src to dst. */
 #define ITS_CP(src, dst) do {			\
 	TS_CP((src), (dst), it_interval);	\
 	TS_CP((src), (dst), it_value);		\
 } while (0)
 
 /*
  * Resource usage record in which the two times are timeval32 and every
  * counter is an int32_t.  Member names follow struct rusage (presumably
  * its compat image).
  */
 struct rusage32 {
 	struct timeval32 ru_utime;
 	struct timeval32 ru_stime;
 	int32_t	ru_maxrss;
 	int32_t	ru_ixrss;
 	int32_t	ru_idrss;
 	int32_t	ru_isrss;
 	int32_t	ru_minflt;
 	int32_t	ru_majflt;
 	int32_t	ru_nswap;
 	int32_t	ru_inblock;
 	int32_t	ru_oublock;
 	int32_t	ru_msgsnd;
 	int32_t	ru_msgrcv;
 	int32_t	ru_nsignals;
 	int32_t	ru_nvcsw;
 	int32_t	ru_nivcsw;
 };
 
 /* Pair of rusage32 records: one for the process itself, one for children. */
 struct wrusage32 {
 	struct rusage32	wru_self;
 	struct rusage32 wru_children;
 };
 
 /* Interval timer setting built from two timeval32 values. */
 struct itimerval32 {
 	struct timeval32 it_interval;
 	struct timeval32 it_value;
 };
 
 /* Sizes of the name arrays in the 4.x statfs32 layout below. */
 #define FREEBSD4_MFSNAMELEN	16
 #define FREEBSD4_MNAMELEN	(88 - 2 * sizeof(int32_t))
 
 /* 4.x version */
 struct statfs32 {
 	int32_t	f_spare2;
 	int32_t	f_bsize;
 	int32_t	f_iosize;
 	int32_t	f_blocks;
 	int32_t	f_bfree;
 	int32_t	f_bavail;
 	int32_t	f_files;
 	int32_t	f_ffree;
 	fsid_t	f_fsid;
 	uid_t	f_owner;
 	int32_t	f_type;
 	int32_t	f_flags;
 	int32_t	f_syncwrites;
 	int32_t	f_asyncwrites;
 	char	f_fstypename[FREEBSD4_MFSNAMELEN];
 	char	f_mntonname[FREEBSD4_MNAMELEN];
 	int32_t	f_syncreads;
 	int32_t	f_asyncreads;
 	int16_t	f_spares1;
 	char	f_mntfromname[FREEBSD4_MNAMELEN];
 	/* __packed: no alignment padding is inserted before this member. */
 	int16_t	f_spares2 __packed;
 	int32_t f_spare[2];
 };
 
 /*
  * Event record whose ident, data and udata members are 32 bits wide.
  * Presumably the compat image of struct kevent.
  */
 struct kevent32 {
 	u_int32_t	ident;		/* identifier for this event */
 	short		filter;		/* filter for event */
 	u_short		flags;
 	u_int		fflags;
 	int32_t		data;
 	u_int32_t	udata;		/* opaque user data identifier */
 };
 
 /*
  * Scatter/gather element with a 32-bit base and an int length.
  * NOTE(review): iov_base presumably holds a user address to be widened
  * with PTRIN() -- confirm against the consumers.
  */
 struct iovec32 {
 	u_int32_t iov_base;
 	int	iov_len;
 };
 
 /*
  * Message header in which msg_name, msg_iov and msg_control are stored
  * as 32-bit integers (presumably user addresses) instead of pointers.
  */
 struct msghdr32 {
 	u_int32_t	 msg_name;
 	socklen_t	 msg_namelen;
 	u_int32_t	 msg_iov;
 	int		 msg_iovlen;
 	u_int32_t	 msg_control;
 	socklen_t	 msg_controllen;
 	int		 msg_flags;
 };
 
 /*
  * File status record using the native dev_t/ino_t/nlink_t types together
  * with timespec32 timestamps.  The older, narrower layout is kept
  * separately as struct freebsd10_stat32.
  */
 struct stat32 {
 	dev_t	st_dev;
 	ino_t	st_ino;
 	nlink_t	st_nlink;
 	mode_t	st_mode;
 	u_int16_t st_padding0;
 	uid_t	st_uid;
 	gid_t	st_gid;
 	dev_t	st_rdev;
 	struct timespec32 st_atim;
 	struct timespec32 st_mtim;
 	struct timespec32 st_ctim;
 	struct timespec32 st_birthtim;
 	off_t	st_size;
 	int64_t	st_blocks;
 	u_int32_t st_blksize;
 	u_int32_t st_flags;
 	u_int64_t st_gen;
 	u_int64_t st_spare[10];
 };
 /*
  * File status record with 32-bit st_dev, st_ino and st_rdev and a 16-bit
  * st_nlink; by its name, the layout used up to FreeBSD 10.
  */
 struct freebsd10_stat32 {
 	u_int32_t st_dev;
 	u_int32_t st_ino;
 	mode_t	st_mode;
 	u_int16_t st_nlink;
 	uid_t	st_uid;
 	gid_t	st_gid;
 	u_int32_t st_rdev;
 	struct timespec32 st_atim;
 	struct timespec32 st_mtim;
 	struct timespec32 st_ctim;
 	off_t	st_size;
 	int64_t	st_blocks;
 	u_int32_t st_blksize;
 	u_int32_t st_flags;
 	u_int32_t st_gen;
 	int32_t	st_lspare;
 	struct timespec32 st_birthtim;
 	/*
 	 * Two unnamed bit-fields of 4 * (16 - sizeof(struct timespec32))
 	 * bits each: together they add 16 - sizeof(struct timespec32)
 	 * bytes of padding after st_birthtim.
 	 */
 	unsigned int :(8 / 2) * (16 - (int)sizeof(struct timespec32));
 	unsigned int :(8 / 2) * (16 - (int)sizeof(struct timespec32));
 };
 
 /*
  * File status record with 16-bit st_dev, st_nlink, st_uid, st_gid and
  * st_rdev and 32-bit size/block members.  NOTE(review): presumably the
  * historic "ostat" layout -- confirm against its users.
  */
 struct ostat32 {
 	__uint16_t st_dev;
 	__uint32_t st_ino;
 	mode_t	st_mode;
 	__uint16_t st_nlink;
 	__uint16_t st_uid;
 	__uint16_t st_gid;
 	__uint16_t st_rdev;
 	__int32_t st_size;
 	struct timespec32 st_atim;
 	struct timespec32 st_mtim;
 	struct timespec32 st_ctim;
 	__int32_t st_blksize;
 	__int32_t st_blocks;
 	u_int32_t st_flags;
 	__uint32_t st_gen;
 };
 
 /*
  * Jail parameters, version-0 layout: a single ip_number and 32-bit
  * path/hostname members (presumably user addresses of strings).
  */
 struct jail32_v0 {
 	u_int32_t	version;
 	uint32_t	path;
 	uint32_t	hostname;
 	u_int32_t	ip_number;
 };
 
 /*
  * Jail parameters with every member 32 bits wide.  NOTE(review): path,
  * hostname, jailname, ip4 and ip6 are presumably user addresses and
  * ip4s/ip6s element counts -- confirm against the consumers.
  */
 struct jail32 {
 	uint32_t	version;
 	uint32_t	path;
 	uint32_t	hostname;
 	uint32_t	jailname;
 	uint32_t	ip4s;
 	uint32_t	ip6s;
 	uint32_t	ip4;
 	uint32_t	ip6;
 };
 
 /*
  * Signal action with the handler slot (sa_u) stored as a 32-bit integer,
  * presumably the user address of the handler.
  */
 struct sigaction32 {
 	u_int32_t	sa_u;
 	int		sa_flags;
 	sigset_t	sa_mask;
 };
 
 /*
  * Thread creation parameters with every member 32 bits wide; flags is
  * the only signed member.  Presumably the compat image of struct
  * thr_param.
  */
 struct thr_param32 {
 	uint32_t start_func;
 	uint32_t arg;
 	uint32_t stack_base;
 	uint32_t stack_size;
 	uint32_t tls_base;
 	uint32_t tls_size;
 	uint32_t child_tid;
 	uint32_t parent_tid;
 	int32_t	 flags;
 	uint32_t rtp;
 	uint32_t spare[3];
 };
 
 /*
  * LDT request arguments, all 32 bits wide.  NOTE(review): descs is
  * presumably a user address of a descriptor array -- confirm.
  */
 struct i386_ldt_args32 {
 	uint32_t start;
 	uint32_t descs;
 	uint32_t num;
 };
 
 /*
  * Alternative layouts for <sys/procfs.h>
  */
 /* Process status record whose register set is a struct reg32. */
 struct prstatus32 {
         int     pr_version;
         u_int   pr_statussz;
         u_int   pr_gregsetsz;
         u_int   pr_fpregsetsz;
         int     pr_osreldate;
         int     pr_cursig;
         pid_t   pr_pid;
         struct reg32 pr_reg;
 };
 
 /*
  * Process info record: version, size, and two character arrays sized
  * PRFNAMESZ+1 and PRARGSZ+1.
  */
 struct prpsinfo32 {
         int     pr_version;
         u_int   pr_psinfosz;
         char    pr_fname[PRFNAMESZ+1];
         char    pr_psargs[PRARGSZ+1];
 };
 
 /* Per-thread record: a name array of MAXCOMLEN+1 bytes plus a pad word. */
 struct thrmisc32 {
         char    pr_tname[MAXCOMLEN+1];
         u_int   _pad;
 };
 
 /*
  * Message queue attributes with int-sized members.  Presumably the
  * compat image of struct mq_attr.
  */
 struct mq_attr32 {
 	int	mq_flags;
 	int	mq_maxmsg;
 	int	mq_msgsize;
 	int	mq_curmsgs;
 	int	__reserved[4];
 };
 
 /*
  * Process information record in which kernel addresses are held as
  * uint32_t and the times/rusage members use the *32 structures above.
  *
  * This revision replaces the dev_t ki_tdev that followed ki_spare_short1
  * with a uint32_t ki_tdev_freebsd10 in the same position, and adds a
  * uint64_t ki_tdev after ki_spareints.
  */
 struct kinfo_proc32 {
 	int	ki_structsize;
 	int	ki_layout;
 	uint32_t ki_args;
 	uint32_t ki_paddr;
 	uint32_t ki_addr;
 	uint32_t ki_tracep;
 	uint32_t ki_textvp;
 	uint32_t ki_fd;
 	uint32_t ki_vmspace;
 	uint32_t ki_wchan;
 	pid_t	ki_pid;
 	pid_t	ki_ppid;
 	pid_t	ki_pgid;
 	pid_t	ki_tpgid;
 	pid_t	ki_sid;
 	pid_t	ki_tsid;
 	short	ki_jobc;
 	short	ki_spare_short1;
-	dev_t	ki_tdev;
+	uint32_t ki_tdev_freebsd10;
 	sigset_t ki_siglist;
 	sigset_t ki_sigmask;
 	sigset_t ki_sigignore;
 	sigset_t ki_sigcatch;
 	uid_t	ki_uid;
 	uid_t	ki_ruid;
 	uid_t	ki_svuid;
 	gid_t	ki_rgid;
 	gid_t	ki_svgid;
 	short	ki_ngroups;
 	short	ki_spare_short2;
 	gid_t 	ki_groups[KI_NGROUPS];
 	uint32_t ki_size;
 	int32_t ki_rssize;
 	int32_t ki_swrss;
 	int32_t ki_tsize;
 	int32_t ki_dsize;
 	int32_t ki_ssize;
 	u_short	ki_xstat;
 	u_short	ki_acflag;
 	fixpt_t	ki_pctcpu;
 	u_int	ki_estcpu;
 	u_int	ki_slptime;
 	u_int	ki_swtime;
 	u_int	ki_cow;
 	u_int64_t ki_runtime;
 	struct	timeval32 ki_start;
 	struct	timeval32 ki_childtime;
 	int	ki_flag;
 	int	ki_kiflag;
 	int	ki_traceflag;
 	char	ki_stat;
 	signed char ki_nice;
 	char	ki_lock;
 	char	ki_rqindex;
 	u_char	ki_oncpu_old;
 	u_char	ki_lastcpu_old;
 	char	ki_tdname[TDNAMLEN+1];
 	char	ki_wmesg[WMESGLEN+1];
 	char	ki_login[LOGNAMELEN+1];
 	char	ki_lockname[LOCKNAMELEN+1];
 	char	ki_comm[COMMLEN+1];
 	char	ki_emul[KI_EMULNAMELEN+1];
 	char	ki_loginclass[LOGINCLASSLEN+1];
 	char	ki_sparestrings[50];
 	int	ki_spareints[KI_NSPARE_INT];
+	uint64_t ki_tdev;
 	int	ki_oncpu;
 	int	ki_lastcpu;
 	int	ki_tracer;
 	int	ki_flag2;
 	int	ki_fibnum;
 	u_int	ki_cr_flags;
 	int	ki_jid;
 	int	ki_numthreads;
 	lwpid_t	ki_tid;
 	struct	priority ki_pri;
 	struct	rusage32 ki_rusage;
 	struct	rusage32 ki_rusage_ch;
 	uint32_t ki_pcb;
 	uint32_t ki_kstack;
 	uint32_t ki_udata;
 	uint32_t ki_tdaddr;
 	uint32_t ki_spareptrs[KI_NSPARE_PTR];	/* spare room for growth */
 	int	ki_sparelongs[KI_NSPARE_LONG];
 	int	ki_sflag;
 	int	ki_tdflags;
 };
 
 /*
  * Start and end of the signal trampoline as 32-bit values, plus four
  * spare words.
  */
 struct kinfo_sigtramp32 {
 	uint32_t ksigtramp_start;
 	uint32_t ksigtramp_end;
 	uint32_t ksigtramp_spare[4];
 };
 
 /*
  * Linker file status, first layout: identical to struct kld32_file_stat
  * except that it has no trailing pathname member.
  */
 struct kld32_file_stat_1 {
 	int	version;	/* set to sizeof(struct kld_file_stat_1) */
 	char	name[MAXPATHLEN];
 	int	refs;
 	int	id;
 	uint32_t address;	/* load address */
 	uint32_t size;		/* size in bytes */
 };
 
 /*
  * Linker file status with 32-bit load address and size, followed by the
  * pathname of the file.
  */
 struct kld32_file_stat {
 	int	version;	/* set to sizeof(struct kld_file_stat) */
 	char	name[MAXPATHLEN];
 	int	refs;
 	int	id;
 	uint32_t address;	/* load address */
 	uint32_t size;		/* size in bytes */
 	char	pathname[MAXPATHLEN];
 };
 
 /*
  * Reaper pid query: a count, 15 pad words and a 32-bit rp_pids value.
  * NOTE(review): rp_pids is presumably the user address of the result
  * array -- confirm against the procctl compat code.
  */
 struct procctl_reaper_pids32 {
 	u_int	rp_count;
 	u_int	rp_pad0[15];
 	uint32_t rp_pids;
 };
 
 #endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */
Index: projects/ino64/sys/kern/kern_descrip.c
===================================================================
--- projects/ino64/sys/kern/kern_descrip.c	(revision 276110)
+++ projects/ino64/sys/kern/kern_descrip.c	(revision 276111)
@@ -1,3834 +1,3839 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static uma_zone_t file_zone;
 static uma_zone_t filedesc0_zone;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, int holdleaders);
 static int	do_dup(struct thread *td, int flags, int old, int new);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static int	fd_last_used(struct filedesc *fdp, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct proc *p);
 
 /* Flags for do_dup() */
 #define	DUP_FIXED	0x1	/* Force fixed allocation. */
 #define	DUP_FCNTL	0x2	/* fcntl()-style errors. */
 #define	DUP_CLOEXEC	0x4	/* Atomically set FD_CLOEXEC. */
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	struct fdescenttbl *ft_table;	/* superseded descriptor table */
 	SLIST_ENTRY(freetable) ft_next;	/* linkage on the per-filedesc list */
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 /* Descriptor table with room for exactly NDFILE entries. */
 struct fdescenttbl0 {
 	int	fdt_nfiles;
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 /*
  * A struct filedesc bundled with the list head for superseded tables,
  * an NDFILE-entry descriptor table and a bitmap covering NDFILE
  * descriptors.
  */
 struct filedesc0 {
 	struct filedesc fd_fd;
 	SLIST_HEAD(, freetable) fd_free;
 	struct	fdescenttbl0 fd_dfiles;
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx fdesc_mtx;
 
 /*
  * Search fdp's in-use bitmap for the lowest clear bit at index 'low' or
  * above.  When low >= size the search is skipped and 'low' is returned
  * unchanged; when every slot up to NDSLOTS(size) is full, 'size' is
  * returned.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *bitmap, avail;
 	int slot, nslots;
 
 	if (low >= size)
 		return (low);
 
 	bitmap = fdp->fd_map;
 	slot = NDSLOT(low);
 	if ((low % NDENTRIES) != 0) {
 		/*
 		 * 'low' falls inside a slot: only the bits at or above it
 		 * are candidates in this first slot.
 		 */
 		avail = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		avail &= ~bitmap[slot];
 		if (avail != 0UL)
 			return (slot * NDENTRIES + ffsl(avail) - 1);
 		slot++;
 	}
 
 	/* Remaining slots are examined whole. */
 	nslots = NDSLOTS(size);
 	while (slot < nslots) {
 		if (bitmap[slot] != ~0UL)
 			return (slot * NDENTRIES + ffsl(~bitmap[slot]) - 1);
 		slot++;
 	}
 	return (size);
 }
 
 /*
  * Search fdp's in-use bitmap downwards for the highest set bit.  In the
  * slot containing 'size' only the bits below 'size' are considered.
  * Returns the bit index, or -1 when no set bit is found.
  */
 static int
 fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *bitmap, used;
 	int slot, lowest;
 
 	bitmap = fdp->fd_map;
 	slot = NDSLOT(size);
 	if ((size % NDENTRIES) != 0) {
 		/* Partial slot: keep only the bits below 'size'. */
 		used = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		used &= bitmap[slot];
 		if (used != 0)
 			return (slot * NDENTRIES + flsl(used) - 1);
 		slot--;
 	}
 
 	lowest = NDSLOT(0);
 	while (slot >= lowest) {
 		if (bitmap[slot] != 0)
 			return (slot * NDENTRIES + flsl(bitmap[slot]) - 1);
 		slot--;
 	}
 	return (-1);
 }
 
 #ifdef INVARIANTS
 /*
  * Return 1 when the bitmap bit for 'fd' is set and 0 otherwise.  Only
  * compiled under INVARIANTS; asserts that 'fd' is within the table.
  */
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 	NDSLOTTYPE slotbits;
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	slotbits = fdp->fd_map[NDSLOT(fd)];
 	if ((slotbits & NDBIT(fd)) == 0)
 		return (0);
 	return (1);
 }
 #endif
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 /*
  * Mark 'fd' as used and keep the fd_lastfile / fd_freefile hints in
  * step.  The filedesc lock must be held exclusively.
  */
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 
 	/* Raise the high-water mark if this descriptor is beyond it. */
 	if (fdp->fd_lastfile < fd)
 		fdp->fd_lastfile = fd;
 
 	/* The free hint only moves when the hinted slot was just taken. */
 	if (fdp->fd_freefile != fd)
 		return;
 	fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
  * Free a file descriptor.
  *
  * Avoid some work if fdp is about to be destroyed.
  */
 /* Release the capability data attached to a descriptor table entry. */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 	struct filecaps *caps;
 
 	caps = &fde->fde_caps;
 	filecaps_free(caps);
 }
 
 /*
  * Release table entry 'fd': free its capability data, zero the first
  * fde_change_size bytes of the entry and mark the descriptor unused.
  * With CAPABILITIES the update is bracketed by the entry's sequence
  * counter.
  */
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *entry = &fdp->fd_ofiles[fd];
 
 #ifdef CAPABILITIES
 	seq_write_begin(&entry->fde_seq);
 #endif
 	fdefree_last(entry);
 	bzero(entry, fde_change_size);
 	fdunused(fdp, fd);
 #ifdef CAPABILITIES
 	seq_write_end(&entry->fde_seq);
 #endif
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /*
  * Report the descriptor table size limit of the calling process: the
  * smaller of the RLIMIT_NOFILE soft limit and maxfilesperproc, further
  * capped by the RACCT_NOFILE limit.  The value is returned through
  * td_retval[0]; the function itself always returns 0.
  */
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 	struct proc *proc;
 	uint64_t racct_cap;
 
 	proc = td->td_proc;
 
 	/* Both limits are read under the process lock. */
 	PROC_LOCK(proc);
 	td->td_retval[0] =
 	    min((int)lim_cur(proc, RLIMIT_NOFILE), maxfilesperproc);
 	racct_cap = racct_get_limit(proc, RACCT_NOFILE);
 	PROC_UNLOCK(proc);
 
 	if (td->td_retval[0] > racct_cap)
 		td->td_retval[0] = racct_cap;
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /*
  * dup2 system call entry: hand both descriptor numbers, converted to
  * int, to do_dup() with DUP_FIXED.
  */
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 	int oldfd, newfd;
 
 	oldfd = (int)uap->from;
 	newfd = (int)uap->to;
 	return (do_dup(td, DUP_FIXED, oldfd, newfd));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /*
  * dup system call entry: call do_dup() with no flags and a minimum new
  * descriptor of 0.
  */
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 	int oldfd;
 
 	oldfd = (int)uap->fd;
 	return (do_dup(td, 0, oldfd, 0));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /*
  * fcntl system call entry: unpack the argument structure and forward to
  * kern_fcntl_freebsd(), returning its result.
  */
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	int error;
 
 	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
 	return (error);
 }
 
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error;
 
 	error = 0;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 		    cmd = F_GETLK;
 		    break;
 		case F_OSETLK:
 		    cmd = F_SETLK;
 		    break;
 		case F_OSETLKW:
 		    cmd = F_SETLKW;
 		    break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
         case F_GETLK:
         case F_SETLK:
         case F_SETLKW:
 	case F_SETLK_REMOTE:
                 error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
                 arg1 = (intptr_t)&fl;
                 break;
 	default:
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, cmd, arg1);
 	if (error)
 		return (error);
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 /*
  * Perform fcntl command 'cmd' on descriptor 'fd' of the calling thread's
  * process.
  *
  * For the record-locking commands (F_GETLK, F_SETLK, F_SETLKW,
  * F_SETLK_REMOTE) 'arg' is dereferenced directly as a struct flock
  * pointer, so it must point to kernel memory; for every other command it
  * is used as an integer.  Integer results are stored in td_retval[0].
  *
  * Returns 0 on success or an errno value; unknown commands yield EINVAL.
  *
  * The locking commands obtain the file with fget_unlocked() and hold
  * only a file reference, never the filedesc lock, so every exit from
  * those cases has to fdrop() the file and nothing else.
  */
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, flg, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	switch (cmd) {
 	case F_DUPFD:
 		tmp = arg;
 		error = do_dup(td, DUP_FCNTL, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = do_dup(td, DUP_FIXED, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		FILEDESC_SLOCK(fdp);
 		if (fget_locked(fdp, fd) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fde = &fdp->fd_ofiles[fd];
 		td->td_retval[0] =
 		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		FILEDESC_XLOCK(fdp);
 		if (fget_locked(fdp, fd) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		fde = &fdp->fd_ofiles[fd];
 		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
 		if (error != 0)
 			break;
 		/* Replace the FCNTLFLAGS bits of f_flag atomically. */
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/* FIOASYNC failed: undo the non-blocking setting. */
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		cap_rights_init(&rights, CAP_FLOCK);
 		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				/* No filedesc lock is held here. */
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			/*
 			 * Temporary api for testing remote lock
 			 * infrastructure.
 			 */
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * The filedesc lock was not held while the lock was being
 		 * set, so we need to recheck if the descriptor is still
 		 * valid, because if it was closed in the meantime we need
 		 * to remove advisory lock from the vnode - close on any
 		 * descriptor leading to an advisory locked vnode, removes
 		 * that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			     foffset < OFF_MIN - flp->l_start)) {
 				/* No filedesc lock is held here. */
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (arg >= 0) {
 			/* Express the request in f_iosize units, rounded up. */
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			fp->f_seqcount = (arg + bsize - 1) / bsize;
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp, 0);
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Return the smaller of the process's RLIMIT_NOFILE soft limit and
  * maxfilesperproc.  The resource limit is read under the process lock.
  */
 static int
 getmaxfd(struct proc *p)
 {
 	int limit;
 
 	PROC_LOCK(p);
 	limit = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	return (limit);
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  *
  * 'flags' is a combination of the DUP_* values:
  *   DUP_FIXED   - 'new' is the exact descriptor to use; a file already
  *                 open there is closed via closefp().
  *   DUP_FCNTL   - report an out-of-range 'new' as EINVAL, not EBADF.
  *   DUP_CLOEXEC - set UF_EXCLOSE on the resulting descriptor.
  * Without DUP_FIXED, 'new' is passed to fdalloc() as the starting point
  * for the allocation.
  *
  * On success the new descriptor number is stored in td_retval[0] and 0
  * is returned.  The filedesc lock is taken exclusively and released
  * before returning on every path.
  */
 static int
 do_dup(struct thread *td, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(p);
 	if (new >= maxfd)
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
 
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	oldfde = &fdp->fd_ofiles[old];
 	/* Duplicating a descriptor onto itself: at most update close-on-exec. */
 	if (flags & DUP_FIXED && old == new) {
 		td->td_retval[0] = new;
 		if (flags & DUP_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		FILEDESC_XUNLOCK(fdp);
 		return (0);
 	}
 	fp = oldfde->fde_file;
 	/* Reference for the new descriptor; dropped again on error paths. */
 	fhold(fp);
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	if (flags & DUP_FIXED) {
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			PROC_LOCK(p);
 			error = racct_set(p, RACCT_NOFILE, new + 1);
 			PROC_UNLOCK(p);
 			if (error != 0) {
 				FILEDESC_XUNLOCK(fdp);
 				fdrop(fp, td);
 				return (EMFILE);
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 			/* The table was replaced; re-derive the source entry. */
 			oldfde = &fdp->fd_ofiles[old];
 		}
 		newfde = &fdp->fd_ofiles[new];
 		if (newfde->fde_file == NULL)
 			fdused(fdp, new);
 	} else {
 		/*
 		 * NOTE(review): unlike the DUP_FIXED path, oldfde is not
 		 * re-derived after fdalloc(); if fdalloc() can grow the
 		 * table, oldfde keeps pointing into the superseded table --
 		 * confirm that this is intended.
 		 */
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			fdrop(fp, td);
 			return (error);
 		}
 		newfde = &fdp->fd_ofiles[new];
 	}
 
 	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
 	KASSERT(old != new, ("new fd is same as old"));
 
 	/* Remember any file currently open at 'new' so it can be closed. */
 	delfp = newfde->fde_file;
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seq_write_begin(&newfde->fde_seq);
 #endif
 	filecaps_free(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
 	if ((flags & DUP_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seq_write_end(&newfde->fde_seq);
 #endif
 	td->td_retval[0] = new;
 
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, 1);
 		/* closefp() drops the FILEDESC lock for us. */
 	} else {
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	return (0);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  *
  * sigiolst is the list head to drain.  The owner (process or process
  * group) is taken from the first entry; every other entry is asserted to
  * have the same owner.  An empty list is a no-op.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		/* Clear the owner's back-reference to this sigio. */
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		/*
 		 * SIGIO_LOCK is released around the credential drop and
 		 * the free, then retaken before the list head is examined
 		 * again.
 		 */
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		free(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  *
  * pgid > 0 names a process, pgid < 0 names process group -pgid, and
  * pgid == 0 only removes the current owner via funsetown().  sigiop is the
  * location that holds the owner's sigio pointer; it is stored in the new
  * sigio as sio_myref and is updated to point at the new sigio on success.
  *
  * Returns 0 on success, ESRCH if the target does not exist (or the target
  * process has P_WEXIT set), and EPERM if the target is in a different
  * session than the calling process.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	/* proctree_lock is held shared from here until insertion or failure. */
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	/* Drop any previously registered owner before linking the new one. */
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	/* Publish the new sigio to the owner under SIGIO_LOCK. */
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	/* Undo the allocation and credential reference taken above. */
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  *
  * Return the sio_pgid recorded in the sigio that *sigiop points to, or 0
  * when *sigiop is NULL.  *sigiop is sampled while holding SIGIO_LOCK.
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 /*
  * Finish closing descriptor 'fd', whose file 'fp' the caller has already
  * taken out of the table of 'fdp'.
  *
  * Must be called with the filedesc lock exclusively held.
  * Function drops the filedesc lock on return.
  *
  * If holdleaders is non-zero and the process has a p_fdtol, the
  * fd_holdleaderscount counter is raised for the duration of closef() and
  * any waiter flagged in fd_holdleaderswakeup is woken once it falls back
  * to zero.
  *
  * Returns the value returned by closef().
  */
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     int holdleaders)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			holdleaders = 0;
 		}
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleting a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		/* Undo the hold taken above and wake a waiter if requested. */
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 /*
  * Close a file descriptor.
  *
  * System call entry point: forwards uap->fd to kern_close() and returns
  * its result.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 /*
  * Close descriptor 'fd' in the calling thread's process.
  *
  * Returns EBADF if fget_locked() finds no file for 'fd'; otherwise the
  * descriptor slot is released with fdfree() and the result of closefp()
  * is returned.
  */
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdfree(fdp, fd);
 
 	/* closefp() drops the FILEDESC lock for us. */
 	return (closefp(fdp, fd, fp, td, 1));
 }
 
 /*
  * Close open file descriptors.
  *
  * Every open descriptor numbered uap->lowfd or higher is closed with
  * kern_close(); individual close errors are ignored and 0 is always
  * returned.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_closefrom(struct thread *td, struct closefrom_args *uap)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int fd;
 
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/*
 	 * A negative starting descriptor behaves like closefrom(0),
 	 * i.e. every file is closed.
 	 */
 	if (uap->lowfd < 0)
 		uap->lowfd = 0;
 
 	FILEDESC_SLOCK(fdp);
 	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd].fde_file == NULL)
 			continue;
 		/* kern_close() takes the lock itself, so drop ours first. */
 		FILEDESC_SUNLOCK(fdp);
 		(void)kern_close(td, fd);
 		FILEDESC_SLOCK(fdp);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	return (0);
 }
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  *
  * COMPAT_43 variant: the result of kern_fstat() is converted with
  * cvtstat() and copied out to uap->sb as a struct ostat.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &sb);
 	if (error != 0)
 		return (error);
 
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->sb, sizeof(osb)));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
 		ktrstat(sbp);
 #endif
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
 	if (error != 0)
 		return (error);
 
 	/* If asynchronous I/O is available, it works for all descriptors. */
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = async_io_version;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Reset a filecaps structure: every byte is zeroed, then the ioctl count
  * is set to -1.
  */
 void
 filecaps_init(struct filecaps *fcaps)
 {
 
 	bzero(fcaps, sizeof(struct filecaps));
 	fcaps->fc_nioctls = -1;
 }
 
 /*
  * Duplicate *src into *dst.  When src carries an ioctl array, dst receives
  * a freshly allocated copy of it (M_WAITOK) instead of sharing the pointer.
  */
 void
 filecaps_copy(const struct filecaps *src, struct filecaps *dst)
 {
 	size_t nbytes;
 
 	*dst = *src;
 	if (src->fc_ioctls == NULL)
 		return;
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	nbytes = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(nbytes, M_FILECAPS, M_WAITOK);
 	memcpy(dst->fc_ioctls, src->fc_ioctls, nbytes);
 }
 
 /*
  * Transfer the contents of *src to *dst by structure assignment and then
  * zero *src, so the ioctl array pointer is left only in *dst.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(struct filecaps));
 }
 
 /*
  * Set up fcaps with full rights: CAP_ALL rights, CAP_FCNTL_ALL fcntls, and
  * no ioctl array with the ioctl count set to -1.
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	CAP_ALL(&fcaps->fc_rights);
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_ioctls = NULL;
 }
 
 /*
  * Release the ioctl array owned by fcaps and zero the whole structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(struct filecaps));
 }
 
 /*
  * Validate the given filecaps structure.
  *
  * The checks are KASSERTs only; 'func' is the caller's name and is used
  * solely in the assertion messages.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	/* The rights set itself must be well-formed. */
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	/* No fcntl bits outside CAP_FCNTL_ALL. */
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	/* Any fcntl bits require the CAP_FCNTL right. */
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	/*
 	 * With an ioctl array the count must be positive; without one it
 	 * must be -1 or 0.
 	 */
 	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 	    ("%s: invalid ioctls", func));
 	/* A non-zero ioctl count requires the CAP_IOCTL right. */
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 /*
  * Grow the file table of fdp to the larger of 'nfd' and twice its current
  * size.  The filedesc lock must be exclusively held.
  */
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int doubled;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	doubled = fdp->fd_nfiles * 2;
 	fdgrowtable(fdp, doubled < nfd ? nfd : doubled);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  *
  * The requested size is rounded up to a multiple of NDENTRIES; if the
  * table is already that large nothing is done.  Otherwise a new table
  * (and, if needed, a new bitmap) is allocated with M_WAITOK, the old
  * contents are copied over and the new table is published.  The old
  * table is not freed here (see below).
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	/*
 	 * If lastfile is -1 this struct filedesc was just allocated and we are
 	 * growing it to accommodate the one we are going to copy from. There
 	 * is no need to have a lock on this one as it's not visible to anyone.
 	 */
 	if (fdp->fd_lastfile != -1)
 		FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * We place the struct freetable in the middle so we don't have
 	 * to worry about padding.
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files pointer. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Do not free the old file table, as some threads may still
 	 * reference entries within it.  Instead, place it on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		/* The freetable record lives right after the old entries. */
 		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 		fdp0 = (struct filedesc0 *)fdp;
 		ft->ft_table = otable;
 		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  *
  * td     - calling thread; its process's descriptor table is used.
  * minfd  - lowest acceptable descriptor number (raised to fd_freefile if
  *          that is larger).
  * result - receives the allocated descriptor on success.
  *
  * The filedesc lock must be exclusively held.  Returns 0 on success or
  * EMFILE when the first free descriptor is at or beyond getmaxfd(), or
  * (with RACCT) when racct_set() refuses the larger table.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd = -1, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(p);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (fd >= maxfd)
 		return (EMFILE);
 	if (fd >= fdp->fd_nfiles) {
 		/* Aim for twice the descriptor number, capped at the limit. */
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_NOFILE, allocfd);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EMFILE);
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate n file descriptors for the process.
  *
  * td    - calling thread; its process's descriptor table is used.
  * minfd - lowest acceptable descriptor number for each allocation.
  * fds   - array of at least n entries receiving the descriptors.
  * n     - number of descriptors to allocate.
  *
  * The filedesc lock must be exclusively held.  The allocation is
  * all-or-nothing: if any fdalloc() call fails, the descriptors obtained so
  * far are marked unused again and EMFILE is returned.  Returns 0 on
  * success.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int i;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/* Honour the caller's lower bound instead of always starting at 0. */
 	for (i = 0; i < n; i++)
 		if (fdalloc(td, minfd, &fds[i]) != 0)
 			break;
 
 	if (i < n) {
 		/* Roll back the descriptors that were already reserved. */
 		for (i--; i >= 0; i--)
 			fdunused(fdp, fds[i]);
 		return (EMFILE);
 	}
 
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file decriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
 {
 	struct file *fp;
 	int error, fd;
 
 	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);		/* no reference held on error */
 
 	error = finstall(td, fp, &fd, flags, NULL);
 	if (error) {
 		fdrop(fp, td);		/* one reference (fp only) */
 		return (error);
 	}
 
 	if (resultfp != NULL)
 		*resultfp = fp;		/* copy out result */
 	else
 		fdrop(fp, td);		/* release local reference */
 
 	if (resultfd != NULL)
 		*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  */
 int
 falloc_noinstall(struct thread *td, struct file **resultfp)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 
 	if ((openfiles >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles >= maxfiles) {
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
 		}
 		return (ENFILE);
 	}
 	atomic_add_int(&openfiles, 1);
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	refcount_init(&fp->f_count, 1);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 /*
  * Install a file in a file descriptor table.
  *
  * td    - calling thread; its process's descriptor table is used.
  * fp    - file to install; an additional reference is taken on success.
  * fd    - receives the newly allocated descriptor number.
  * flags - if O_CLOEXEC is set, UF_EXCLOSE is set on the new entry.
  * fcaps - capability data to move into the entry (the source is cleared
  *         by filecaps_move()), or NULL to grant full rights.
  *
  * Returns 0 on success or the error from fdalloc().
  */
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct filedescent *fde;
 	int error;
 
 	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
 	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 
 	FILEDESC_XLOCK(fdp);
 	if ((error = fdalloc(td, 0, fd))) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	/* Reference for the descriptor table entry. */
 	fhold(fp);
 	fde = &fdp->fd_ofiles[*fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&fde->fde_seq);
 #endif
 	fde->fde_file = fp;
 	if ((flags & O_CLOEXEC) != 0)
 		fde->fde_flags |= UF_EXCLOSE;
 	if (fcaps != NULL)
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
 #ifdef CAPABILITIES
 	seq_write_end(&fde->fde_seq);
 #endif
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * fdp       - table to inherit from, or NULL for a fresh table with no
  *             directory references.
  * prepfiles - if true, the new table is grown until it can hold every
  *             descriptor up to fdp->fd_lastfile.
  *
  * If fdp is not NULL and prepfiles is true, return with fdp shared locked.
  * If prepfiles is false, fdp is unlocked again before returning.
  *
  * No file entries are copied here; only the table is sized.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	newfdp->fd_refcnt = 1;
 	newfdp->fd_holdcnt = 1;
 	newfdp->fd_cmask = CMASK;
 	/* Start out with the map and table embedded in struct filedesc0. */
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_lastfile = -1;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	/*
 	 * First sizing attempt, made before fdp is locked; the size is
 	 * re-checked under the lock in the loop below.
 	 */
 	if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 
 	FILEDESC_SLOCK(fdp);
 	newfdp->fd_cdir = fdp->fd_cdir;
 	if (newfdp->fd_cdir)
 		VREF(newfdp->fd_cdir);
 	newfdp->fd_rdir = fdp->fd_rdir;
 	if (newfdp->fd_rdir)
 		VREF(newfdp->fd_rdir);
 	newfdp->fd_jdir = fdp->fd_jdir;
 	if (newfdp->fd_jdir)
 		VREF(newfdp->fd_jdir);
 
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 	} else {
 		/*
 		 * The lock is dropped around each fdgrowtable() call, so
 		 * fd_lastfile is re-read after relocking until the new
 		 * table is large enough.
 		 */
 		while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 			FILEDESC_SUNLOCK(fdp);
 			fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 
 	return (newfdp);
 }
 
 /*
  * Return the descriptor table of process p with its hold count raised by
  * one, or NULL if the process has no table.  fdesc_mtx is held while p_fd
  * is read and the count is updated.
  */
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *held;
 
 	mtx_lock(&fdesc_mtx);
 	if ((held = p->p_fd) != NULL)
 		held->fd_holdcnt++;
 	mtx_unlock(&fdesc_mtx);
 	return (held);
 }
 
 /*
  * Drop one hold on fdp.  When the hold count is above one it is
  * decremented under fdesc_mtx; if holds remain afterwards the function
  * returns.  Otherwise the table's lock is destroyed and the structure is
  * returned to filedesc0_zone.
  */
 static void
 fddrop(struct filedesc *fdp)
 {
 	int remaining;
 
 	if (fdp->fd_holdcnt > 1) {
 		mtx_lock(&fdesc_mtx);
 		remaining = --fdp->fd_holdcnt;
 		mtx_unlock(&fdesc_mtx);
 		if (remaining > 0)
 			return;
 	}
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 /*
  * Share a filedesc structure: bump its reference count under the
  * exclusive filedesc lock and return the same pointer.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	FILEDESC_XLOCK(fdp);
 	fdp->fd_refcnt += 1;
 	FILEDESC_XUNLOCK(fdp);
 	return (fdp);
 }
 
 /*
  * Give the calling process a private descriptor table.  Nothing happens
  * when the table's reference count is already 1; otherwise the table is
  * copied with fdcopy(), the process's reference to the old one is released
  * with fdescfree(), and the copy is installed as p_fd.
  */
 void
 fdunshare(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *copy;
 
 	if (p->p_fd->fd_refcnt != 1) {
 		copy = fdcopy(p->p_fd);
 		fdescfree(td);
 		p->p_fd = copy;
 	}
 }
 
 /*
  * Copy a filedesc structure.
  *
  * fdp must not be NULL (asserted with MPASS).  The new table is created
  * by fdinit(fdp, true), which returns with fdp shared locked; that lock is
  * released here once all entries have been copied.
  *
  * Only descriptors whose file operations carry DFLAG_PASSABLE are copied;
  * each copied entry gets its own filecaps copy and an extra reference on
  * the file.  fd_cmask is inherited from fdp.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* Remember the first slot left empty in the copy. */
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		/* Replace the shared ioctl array pointer with a private copy. */
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	/* No hole found: the first free slot is just past the last entry. */
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp loses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  *
  * The calling process must have a p_fdtol (asserted with MPASS).  On return
  * p_fdtol is NULL, the fdtol's reference count has been dropped, and the
  * fdtol itself is unlinked and freed if no references or holds remain.
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	/*
 	 * Last reference for this leader and the leader has P_ADVLOCK set:
 	 * release the leader's POSIX locks on every vnode-backed file.
 	 */
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		for (i = 0; i <= fdp->fd_lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE)
 				continue;
 			/*
 			 * Hold the file while the filedesc lock is dropped
 			 * around VOP_ADVLOCK().
 			 */
 			fhold(fp);
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or do_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		/* Unlink from the doubly linked fdtol ring; freed below. */
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	/* The free is done after the filedesc lock has been released. */
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  *
  * Detaches the descriptor table from the calling thread's process (p_fd
  * becomes NULL) and drops one reference.  If other references remain the
  * function returns at that point.  Otherwise every open file is closed,
  * the dynamically allocated map, table and retired tables are freed, the
  * directory vnodes are released and the hold on the structure is dropped
  * with fddrop().
  */
 void
 fdescfree(struct thread *td)
 {
 	struct filedesc0 *fdp0;
 	struct filedesc *fdp;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	struct vnode *cdir, *jdir, *rdir;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	PROC_LOCK(td->td_proc);
 	racct_set(td->td_proc, RACCT_NOFILE, 0);
 	PROC_UNLOCK(td->td_proc);
 #endif
 
 	if (td->td_proc->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	/* p_fd is cleared under fdesc_mtx, the mutex fdhold() reads it under. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
 	FILEDESC_XLOCK(fdp);
 	i = --fdp->fd_refcnt;
 	if (i > 0) {
 		/* Still shared with another process; nothing more to do. */
 		FILEDESC_XUNLOCK(fdp);
 		return;
 	}
 
 	/* Take over the directory references; they are released below. */
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
 	/* Close every file that is still open in the table. */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			(void) closef(fp, td);
 		}
 	}
 
 	/* Free the map and table only if they outgrew the embedded ones. */
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	/* Free the old tables that fdgrowtable() parked on the freelist. */
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	if (cdir != NULL)
 		vrele(cdir);
 	if (rdir != NULL)
 		vrele(rdir);
 	if (jdir != NULL)
 		vrele(jdir);
 
 	fddrop(fdp);
 }
 
 /*
  * For setugid programs, we don't want people to use that setugidness
  * to generate error messages which write to a file which would otherwise
  * be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  *
  * Returns true only for a vnode-backed file whose vnode has VV_PROCDEP set.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 
 	return (fp->f_type == DTYPE_VNODE &&
 	    (fp->f_vnode->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setugid thing safe, if at all possible.
  *
  * Descriptors 0, 1 and 2 are inspected; each one that is open and judged
  * unsafe by is_unsafe() is closed.  The table must not be shared.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp;
 	int fd;
 
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	for (fd = 0; fd < 3; fd++) {
 		fp = fdp->fd_ofiles[fd].fde_file;
 		if (fp == NULL || !is_unsafe(fp))
 			continue;
 
 		FILEDESC_XLOCK(fdp);
 		knote_fdclose(td, fd);
 		/*
 		 * Clear the table slot before closing so that nothing
 		 * races with the descriptor while the close blocks.
 		 */
 		fdfree(fdp, fd);
 		FILEDESC_XUNLOCK(fdp);
 		(void) closef(fp, td);
 	}
 }
 
 /*
  * Undo the installation of 'fp' at descriptor 'idx', but only if that slot
  * still holds 'fp': the slot is released and one reference on the file is
  * dropped.  If the slot holds something else (for instance because another
  * thread closed the descriptor in the meantime) nothing is changed.
  * Intended for error paths of callers that obtained the file via falloc().
  */
 void
 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file != fp) {
 		FILEDESC_XUNLOCK(fdp);
 		return;
 	}
 	fdfree(fdp, idx);
 	FILEDESC_XUNLOCK(fdp);
 	fdrop(fp, td);
 }
 
 /*
  * Close descriptors that must not survive an exec: every open descriptor
  * that is flagged UF_EXCLOSE or whose file is of type DTYPE_MQUEUE.
  * The table must not be shared.
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct filedescent *entry;
 	struct file *fp;
 	int fd;
 
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		entry = &fdp->fd_ofiles[fd];
 		fp = entry->fde_file;
 		if (fp == NULL || (fp->f_type != DTYPE_MQUEUE &&
 		    (entry->fde_flags & UF_EXCLOSE) == 0))
 			continue;
 
 		FILEDESC_XLOCK(fdp);
 		fdfree(fdp, fd);
 		/* closefp() releases the FILEDESC lock before returning. */
 		(void) closefp(fdp, fd, fp, td, 0);
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = do_dup(td, DUP_FIXED, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * For vnode-backed files with a non-NULL td, POSIX advisory locks held on
  * the vnode by the process leader (and by every other leader sharing the
  * descriptor table) are released first.  The return value is that of
  * fdrop().
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		vp = fp->f_vnode;
 		/* Unlock the whole file on behalf of our own leader. */
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			/*
 			 * Walk the circular list, starting after our own
 			 * entry and stopping when we come back around to it.
 			 */
 			for (fdtol = fdtol->fdl_next;
 			     fdtol != td->td_proc->p_fdtol;
 			     fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				     P_ADVLOCK) == 0)
 					continue;
 				/*
 				 * Bump the hold count before dropping the
 				 * lock for the VOP call.
 				 * NOTE(review): presumably this keeps the
 				 * entry from being freed meanwhile -- confirm.
 				 */
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				/* Wake a waiter once the last hold is gone. */
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Fill in the type, flags, private data and operations vector of a file.
  *
  * f_ops is published last, with a release store, so that anyone who
  * observes the new ops also observes the type, flags and data written
  * before it.  This keeps ops methods from being invoked on a file whose
  * other fields are not yet set.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 	/* Plain stores first; they are ordered before the release below. */
 	fp->f_type = type;
 	fp->f_flag = flag;
 	fp->f_data = data;
 
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops,
 	    (uintptr_t)ops);
 }
 
 /*
  * Look up descriptor 'fd' in 'fdp' without taking the filedesc lock and
  * return the file, with a new reference, in *fpp.
  *
  * Returns EBADF if 'fd' is out of range or the slot is empty.  With
  * CAPABILITIES, when 'needrightsp' is non-NULL the descriptor's rights are
  * checked against it (and against 'needfcntl' when CAP_FCNTL is among the
  * needed rights); a failed check returns that error.  If 'haverightsp' is
  * non-NULL it receives the descriptor's rights, or all rights when
  * CAPABILITIES is not compiled in.  Returns 0 on success.
  */
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
 {
 #ifdef CAPABILITIES
 	struct filedescent fde;
 #endif
 	struct fdescenttbl *fdt;
 	struct file *fp;
 	u_int count;
 #ifdef CAPABILITIES
 	seq_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	if (fd < 0 || fd >= fdt->fdt_nfiles)
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		/*
 		 * Snapshot the whole entry under its sequence counter and
 		 * retry until a consistent copy is obtained.
 		 */
 		seq = seq_read(fd_seq(fdt, fd));
 		fde = fdt->fdt_ofiles[fd];
 		if (!seq_consistent(fd_seq(fdt, fd), seq)) {
 			cpu_spinwait();
 			continue;
 		}
 		fp = fde.fde_file;
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		/* Rights are checked against the snapshot taken above. */
 		haverights = *cap_rights_fde(&fde);
 		if (needrightsp != NULL) {
 			error = cap_check(&haverights, needrightsp);
 			if (error != 0)
 				return (error);
 			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
 				error = cap_fcntl_check_fde(&fde, needfcntl);
 				if (error != 0)
 					return (error);
 			}
 		}
 #endif
 	retry:
 		count = fp->f_count;
 		if (count == 0) {
 			/*
 			 * Force a reload. Other thread could reallocate the
 			 * table before this fd was closed, so it is possible
 			 * that there is a stale fp pointer in cached version.
 			 */
 			fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0)
 			goto retry;
 		fdt = fdp->fd_files;
 		/*
 		 * Confirm the slot still holds what we referenced; if it
 		 * changed, give the reference back and start over.
 		 */
 #ifdef	CAPABILITIES
 		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (haverightsp != NULL) {
 #ifdef CAPABILITIES
 		*haverightsp = haverights;
 #else
 		CAP_ALL(haverightsp);
 #endif
 	}
 	return (0);
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.  'flags' must be one of 0, FREAD, FWRITE or FEXEC.
  *
  * File's rights will be checked against the capability rights mask.  When
  * 'maxprotp' is non-NULL, CAP_MMAP is added to the required rights and
  * *maxprotp receives the maximum VM protection for the descriptor.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp, u_char *maxprotp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	cap_rights_t haverights, needrights;
 	int error;
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
 	/* Work on a private copy so the caller's rights are not modified. */
 	if (needrightsp != NULL)
 		needrights = *needrightsp;
 	else
 		cap_rights_init(&needrights);
 	if (maxprotp != NULL)
 		cap_rights_set(&needrights, CAP_MMAP);
 	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
 	if (error != 0)
 		return (error);
 	/* A file still carrying badfileops is treated as not open. */
 	if (fp->f_ops == &badfileops) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 #ifdef CAPABILITIES
 	/*
 	 * If requested, convert capability rights to access flags.
 	 */
 	if (maxprotp != NULL)
 		*maxprotp = cap_rights_to_vmprot(&haverights);
 #else /* !CAPABILITIES */
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 #endif /* CAPABILITIES */
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 */
 	error = 0;
 	switch (flags) {
 	case FREAD:
 	case FWRITE:
 		if ((fp->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 		/* Needs read or exec access, and must not be writable. */
 	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 		    ((fp->f_flag & FWRITE) != 0))
 			error = EBADF;
 		break;
 	case 0:
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * Get a held reference to the file behind 'fd', checking 'rightsp' but
  * imposing no access-mode requirement.
  */
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, 0, rightsp, NULL);
 	return (error);
 }
 
 /*
  * Like fget(), but additionally reports through 'maxprotp' the maximum
  * VM protection the descriptor allows.
  */
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
     struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, 0, rightsp, maxprotp);
 	return (error);
 }
 
 /*
  * Like fget(), but fails with EBADF unless the file is open for reading.
  */
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, FREAD, rightsp, NULL);
 	return (error);
 }
 
 /*
  * Like fget(), but fails with EBADF unless the file is open for writing.
  */
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, FWRITE, rightsp, NULL);
 	return (error);
 }
 
 /*
  * Variant of fget() that hands back the vnode behind the descriptor
  * instead of the file.  EINVAL is returned when the file has no vnode.
  * Note that pipes use vnodes but never have VM objects.  On success the
  * vnode has been vref()'d; the file reference is always released before
  * returning, and *vpp is NULL on any failure.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
 	if (error != 0)
 		return (error);
 
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		*vpp = vp;
 		vref(vp);
 	} else
 		error = EINVAL;
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Return the referenced vnode behind 'fd' with no access-mode requirement.
  */
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, 0, rightsp, vpp);
 	return (error);
 }
 
 /*
  * Return the referenced vnode behind 'fd' in *vpp and copy the
  * descriptor's capabilities into 'havecaps'.
  *
  * Returns EBADF if the descriptor is not open (or still has badfileops),
  * the cap_check() error if 'needrightsp' is not satisfied (CAPABILITIES
  * only), and EINVAL if the file has no vnode.
  *
  * NOTE(review): this uses fget_locked() and takes no lock itself, so
  * presumably the caller must hold the filedesc lock -- confirm.
  */
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 #ifdef CAPABILITIES
 	int error;
 #endif
 
 	fdp = td->td_proc->p_fd;
 	fp = fget_locked(fdp, fd);
 	if (fp == NULL || fp->f_ops == &badfileops)
 		return (EBADF);
 
 #ifdef CAPABILITIES
 	if (needrightsp != NULL) {
 		error = cap_check(cap_rights(fdp, fd), needrightsp);
 		if (error != 0)
 			return (error);
 	}
 #endif
 
 	if (fp->f_vnode == NULL)
 		return (EINVAL);
 
 	/* No file reference is taken here; only the vnode is referenced. */
 	*vpp = fp->f_vnode;
 	vref(*vpp);
 	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
 
 	return (0);
 }
 
 /*
  * Like fgetvp(), but the file must be open for reading.
  */
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FREAD, rightsp, vpp);
 	return (error);
 }
 
 /*
  * Like fgetvp(), but the file must pass the FEXEC access check.
  */
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FEXEC, rightsp, vpp);
 	return (error);
 }
 
 #ifdef notyet
 /*
  * Like fgetvp(), but the file must be open for writing.  Compiled only
  * when 'notyet' is defined.
  */
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FWRITE, rightsp, vpp);
 	return (error);
 }
 #endif
 
 /*
  * Variant of fget() that hands back the socket behind the descriptor;
  * ENOTSOCK is returned when the descriptor is not a socket.
  *
  * The socket's reference count is bumped before it is returned, and the
  * file's flags are stored through 'fflagp' when it is non-NULL.  XXX Also
  * obtain the SX lock in the future.
  *
  * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
  * on their file descriptor reference to prevent the socket from being free'd
  * during use.
  */
 int
 fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
     u_int *fflagp)
 {
 	struct file *fp;
 	struct socket *so;
 	int error;
 
 	/* Give the outputs defined values on every failure path. */
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 
 	error = _fget(td, fd, &fp, 0, rightsp, NULL);
 	if (error != 0)
 		return (error);
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		so = fp->f_data;
 		*spp = so;
 		if (fflagp != NULL)
 			*fflagp = fp->f_flag;
 		SOCK_LOCK(so);
 		soref(so);
 		SOCK_UNLOCK(so);
 	} else
 		error = ENOTSOCK;
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in the
  * future.  The last reference closes the socket.
  *
  * Note: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
 	/*
 	 * NOTE(review): both locks are acquired here but never explicitly
 	 * released in this function; presumably sorele() drops them --
 	 * confirm.
 	 */
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	/* Run the release in the socket's own vnet context. */
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Handle the last reference to a file being closed.
  *
  * Panics if the reference count is not zero.  Calls the file's close
  * method (unless it still has badfileops), decrements the global
  * open-file counter, and releases the credential, the advice record and
  * the file structure itself.  Returns the close method's error, or 0 if
  * it was not called.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 
 	error = 0;
 	if (fp->f_count != 0)
 		panic("fdrop: count %d", fp->f_count);
 	if (fp->f_ops != &badfileops)
 		error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  *
  * Returns EOPNOTSUPP if the descriptor is not a vnode, EBADF if 'how'
  * contains none of LOCK_UN, LOCK_EX or LOCK_SH, and otherwise the result
  * of VOP_ADVLOCK().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	/* LOCK_UN is tested first, so it wins over LOCK_EX/LOCK_SH. */
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	/* FHASLOCK is set before the lock attempt, whatever its outcome. */
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	/* Without LOCK_NB the request is allowed to wait (F_WAIT). */
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  *
  * 'openerror' selects the behavior and must be ENODEV or ENXIO (see
  * below).  On success the new descriptor number is stored in *indxp and 0
  * is returned.  Returns EBADF if 'dfd' is not open, the fdalloc() error if
  * no descriptor could be allocated, and EACCES (ENODEV case) if 'mode'
  * asks for access the existing descriptor does not have.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	int error, indx;
 
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			/* Give back the slot reserved by fdalloc(). */
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		/* The new slot holds its own reference on the file. */
 		fhold(fp);
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 * No extra reference is taken: the old slot is cleared and
 		 * marked unused.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		bzero(oldfde, fde_change_size);
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  *
  * The global rootvnode and prison0's root are updated the same way.  Each
  * replacement takes a reference on `newdp'; the matching references on
  * `olddp' are released together at the end, after all locks are dropped.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	/* Nothing to do when the reference count of olddp is exactly one. */
 	if (vrefcnt(olddp) == 1)
 		return;
 	nrele = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_XLOCK(fdp);
 		/* Current, root and jail directories are checked in turn. */
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vref(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_jdir == olddp) {
 			vref(newdp);
 			fdp->fd_jdir = newdp;
 			nrele++;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 	}
 	sx_sunlock(&allproc_lock);
 	if (rootvnode == olddp) {
 		vref(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vref(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vref(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	/* Release one olddp reference per pointer that was replaced. */
 	while (nrele--)
 		vrele(olddp);
 }
 
 /*
  * Allocate and initialize a filedesc_to_leader entry for 'leader'.
  *
  * With 'old' NULL the entry forms a one-element circular list; otherwise
  * it is linked in right after 'old' while holding the filedesc lock of
  * 'fdp'.  The new entry starts with one reference and no holds.
  */
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *node;
 
 	node = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	node->fdl_leader = leader;
 	node->fdl_refcount = 1;
 	node->fdl_holdcount = 0;
 	node->fdl_wakeup = 0;
 
 	if (old == NULL) {
 		/* Sole member: the entry points at itself both ways. */
 		node->fdl_prev = node;
 		node->fdl_next = node;
 		return (node);
 	}
 
 	FILEDESC_XLOCK(fdp);
 	node->fdl_next = old->fdl_next;
 	node->fdl_prev = old;
 	old->fdl_next = node;
 	node->fdl_next->fdl_prev = node;
 	FILEDESC_XUNLOCK(fdp);
 	return (node);
 }
 
 /*
  * Get file structures globally.
  *
  * Sysctl handler that emits one struct xfile per open descriptor of every
  * process the requester is allowed to see.  When no output buffer is
  * supplied, only a size estimate is reported.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		/* Size probe: sum fd_lastfile over all processes. */
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			if (p->p_state == PRS_NEW)
 				continue;
 			fdp = fdhold(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			if (fdp->fd_lastfile > 0)
 				n += fdp->fd_lastfile;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	/* xf is reused for every record; zero it once up front. */
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/* Skip processes the requesting thread may not see. */
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		PROC_UNLOCK(p);
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		/* Stop early if the table's reference count drops to zero. */
 		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 /* Read-only, MP-safe opaque node kern.file backed by the handler above. */
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 /*
  * When the expected size is defined, fail the build if struct kinfo_file
  * does not have exactly that size.
  */
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 /*
  * Convert a set of kernel file flags (F* and O_* bits) into the
  * corresponding KF_FLAG_* bits.  Bits without a table entry are dropped.
  */
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int idx;
 	int translated;
 
 	translated = 0;
 	for (idx = 0; idx < nitems(fflags_table); idx++) {
 		if ((fflags & fflags_table[idx].fflag) != 0)
 			translated |= fflags_table[idx].kf_fflag;
 	}
 	return (translated);
 }
 
 /*
  * Shrink kf_structsize so the record ends right after the NUL that
  * terminates kf_path, rounded up to a multiple of sizeof(uint64_t).
  */
 static void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path);
 	kif->kf_structsize += strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 /*
  * Fill '*kif' with a description of the open file 'fp' at descriptor
  * 'fd'.  'rightsp' supplies the capability rights to report; when NULL an
  * empty rights set is recorded.  Type-specific details come from the
  * file's fill_kinfo method, and the record is packed before returning.
  */
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp != NULL)
 		kif->kf_cap_rights = *rightsp;
 	else
 		cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = fp->f_count;
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	error = fo_fill_kinfo(fp, kif, fdp);
 	/* The attributes are flagged valid only if the method succeeded. */
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	pack_kinfo(kif);
 }
 
 /*
  * Fill '*kif' with a description of vnode 'vp', reported under the
  * (pseudo-)descriptor number 'fd' with flags translated from 'fflags'.
  * Reference count and offset are reported as -1.  The caller's reference
  * on 'vp' is released before returning.
  */
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	error = vn_fill_kinfo_vnode(vp, kif);
 	/* The attributes are flagged valid only if the fill succeeded. */
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 	pack_kinfo(kif);
 	vrele(vp);
 }
 
 /*
  * Working state shared by the export_*_to_sb() helpers while descriptor
  * records are appended to an sbuf.
  */
 struct export_fd_buf {
 	/* Table whose shared lock is dropped around sbuf writes; may be NULL. */
 	struct filedesc		*fdp;
 	/* Destination buffer for the packed records. */
 	struct sbuf 		*sb;
 	/* Bytes still allowed; -1 means unlimited, 0 means export is over. */
 	ssize_t			remainder;
 	/* Scratch record filled in before each append. */
 	struct kinfo_file	kif;
 };
 
 /*
  * Append the packed record held in 'efbuf->kif' to the sbuf.
  *
  * When a byte limit is in force (remainder != -1) and the record no
  * longer fits, the limit is set to 0 to mark the export as finished and
  * 0 is returned without writing.  Returns ENOMEM if the sbuf append
  * fails, 0 otherwise.
  */
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	struct kinfo_file *rec;
 
 	rec = &efbuf->kif;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < rec->kf_structsize) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= rec->kf_structsize;
 	}
 	if (sbuf_bcat(efbuf->sb, rec, rec->kf_structsize) != 0)
 		return (ENOMEM);
 	return (0);
 }
 
 /*
  * Describe the file 'fp' at descriptor 'fd' and append the record to the
  * sbuf.  Does nothing once the byte budget is exhausted.  The shared
  * filedesc lock is released around the sbuf write and re-taken before
  * returning, so it is held again when the caller resumes.
  */
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp);
 	FILEDESC_SUNLOCK(efbuf->fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 /*
  * Describe vnode 'vp' under pseudo-descriptor 'fd' and append the record
  * to the sbuf.  If 'efbuf->fdp' is set, its shared lock is released for
  * the duration of the work and re-taken before returning.
  *
  * Note that when the byte budget is already exhausted the function
  * returns early and 'vp' is not passed to export_vnode_to_kinfo(), which
  * is where the vnode reference is otherwise released.
  */
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SUNLOCK(efbuf->fdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 /*
  * Store a process's file descriptor information in an sbuf.
  *
  * Records are emitted for the ktrace vnode, the text vnode, the
  * controlling terminal, the current/root/jail directories and then every
  * open descriptor.  'maxlen' limits the number of bytes produced; -1
  * means no limit.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	int error, i;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Reference the per-process vnodes while the proc lock is held; they
 	 * are exported after it has been dropped.
 	 */
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
 		vref(tracevp);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vref(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vref(cttyvp);
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	/* No filedesc lock is held yet, so leave efbuf->fdp unset for now. */
 	efbuf->fdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	/* Return values of these three exports are not checked. */
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	if (fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	FILEDESC_SLOCK(fdp);
 	/* working directory */
 	if (fdp->fd_cdir != NULL) {
 		vref(fdp->fd_cdir);
 		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	/* root directory */
 	if (fdp->fd_rdir != NULL) {
 		vref(fdp->fd_rdir);
 		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 	}
 	/* jail directory */
 	if (fdp->fd_jdir != NULL) {
 		vref(fdp->fd_jdir);
 		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 	}
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		cap_rights_init(&rights);
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		/* Stop on error or once the byte budget has been used up. */
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 fail:
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 /*
  * When the expected size is defined, fail the build if struct kinfo_ofile
  * does not have exactly that size.
  */
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 /*
  * Convert a struct kinfo_file into the older struct kinfo_ofile layout.
  * The structure size is fixed at sizeof(*okif), only a subset of the
  * KF_FLAG_* bits is carried over, and the path is copied with truncation
  * to the destination buffer size.
  */
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_type = kif->kf_type;
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_ref_count = kif->kf_ref_count;
 	/* Flags outside this mask are not passed through. */
 	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 	okif->kf_offset = kif->kf_offset;
 	okif->kf_vnode_type = kif->kf_vnode_type;
 	okif->kf_sock_domain = kif->kf_sock_domain;
 	okif->kf_sock_type = kif->kf_sock_type;
 	okif->kf_sock_protocol = kif->kf_sock_protocol;
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
-	okif->kf_sa_local = kif->kf_sa_local;
-	okif->kf_sa_peer = kif->kf_sa_peer;
+	if (kif->kf_type == KF_TYPE_SOCKET) {
+		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
+		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
+	} else {
+		okif->kf_sa_local.ss_family = AF_UNSPEC;
+		okif->kf_sa_peer.ss_family = AF_UNSPEC;
+	}
 }
 
 /*
  * Emit one old-format (kinfo_ofile) record describing vnode 'vp' under
  * pseudo-descriptor 'type'.  'kif' and 'okif' are caller-provided scratch
  * buffers.  The shared filedesc lock is expected to be held on entry; it
  * is released around the copy-out and held again on return.  Returns the
  * SYSCTL_OUT() result.
  */
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
 {
 	int error;
 
 	/* This reference is released inside export_vnode_to_kinfo(). */
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	export_vnode_to_kinfo(vp, type, 0, kif);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  *
  * Old-format variant: emits fixed-size struct kinfo_ofile records for the
  * current/root/jail directories and every open descriptor of the process
  * whose pid is the first name component.  Returns ENOENT if the process
  * has no descriptor table.
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	int error, i, *name;
 	struct file *fp;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (ENOENT);
 	/* Scratch buffers are heap-allocated rather than placed on the stack. */
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	FILEDESC_SLOCK(fdp);
 	/* Return values of the three directory exports are not checked. */
 	if (fdp->fd_cdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
 		    okif, fdp, req);
 	if (fdp->fd_rdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
 		    okif, fdp, req);
 	if (fdp->fd_jdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 		    okif, fdp, req);
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp);
 		/* Drop the lock for the copy-out and retake it afterwards. */
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	/*
 	 * NOTE(review): a SYSCTL_OUT() failure stops the loop above but is
 	 * not reported; 0 is returned regardless -- confirm this is intended.
 	 */
 	return (0);
 }
 
 /* Read-only, MP-safe node kern.proc.ofiledesc backed by the handler above. */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 /*
  * Translate a vnode type (VBAD, VREG, ...) into the corresponding
  * KF_VTYPE_* value.  Types without a table entry map to
  * KF_VTYPE_UNKNOWN.
  */
 int
 vntype_to_kinfo(int vtype)
 {
 	/*
 	 * The table never changes, so keep it static and const: it is then
 	 * built once at compile time instead of being re-initialized on the
 	 * stack on every call, matching the table in xlate_fflags().
 	 */
 	static const struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 /* Read-only, MP-safe node kern.proc.filedesc backed by sysctl_kern_proc_filedesc(). */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process's current working directory information in an sbuf.
  *
  * 'maxlen' limits the number of bytes produced; -1 means no limit.
  * Returns EINVAL if the process has no descriptor table or no current
  * directory, otherwise the result of the export.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = fdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir == NULL)
 		error = EINVAL;
 	else {
 		/* Take the reference that the export helper will release. */
 		vref(fdp->fd_cdir);
 		error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
 		    FREAD, efbuf);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * Debugging aid: map a file type (DTYPE_*) to a short human-readable
  * tag.  Type 0 yields "zero"; anything not listed yields "unkn".
  */
 static const char *
 file_type_to_name(short type)
 {
 	static const struct {
 		int		dtype;
 		const char	*label;
 	} type_names[] = {
 		{ 0, "zero" },
 		{ DTYPE_VNODE, "vnod" },
 		{ DTYPE_SOCKET, "sock" },
 		{ DTYPE_PIPE, "pipe" },
 		{ DTYPE_FIFO, "fifo" },
 		{ DTYPE_KQUEUE, "kque" },
 		{ DTYPE_CRYPTO, "crpt" },
 		{ DTYPE_MQUEUE, "mque" },
 		{ DTYPE_SHM, "shm" },
 		{ DTYPE_SEM, "ksem" }
 	};
 	unsigned int i;
 
 	for (i = 0; i < nitems(type_names); i++) {
 		if (type_names[i].dtype == type)
 			return (type_names[i].label);
 	}
 	return ("unkn");
 }
 
 /*
  * Debugging aid: find a process whose descriptor array contains the
  * given file.  Several processes may qualify; the first one encountered
  * is returned, or NULL when there is none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int slot;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Skip processes still being created or without a table. */
 		if (p->p_state == PRS_NEW || (fdp = p->p_fd) == NULL)
 			continue;
 		for (slot = 0; slot <= fdp->fd_lastfile; slot++)
 			if (fdp->fd_ofiles[slot].fde_file == fp)
 				return (p);
 	}
 	return (NULL);
 }
 
 /*
  * Print a one-line summary of 'fp' with db_printf(), preceded by the
  * column header line when 'header' is non-zero.  The FPID/FCmd columns
  * describe the first process found referencing the file, or -1 and "-"
  * when there is none.
  */
 static void
 db_print_file(struct file *fp, int header)
 {
 	struct proc *owner;
 	const char *comm;
 	int pid;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 	owner = file_to_first_proc(fp);
 	if (owner != NULL) {
 		pid = owner->p_pid;
 		comm = owner->p_comm;
 	} else {
 		pid = -1;
 		comm = "-";
 	}
 	/* The GCFl and MCount columns are always printed as zero. */
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    0, fp->f_count, 0, fp->f_vnode, pid, comm);
 }
 
 /*
  * DDB "show file <addr>": print the struct file at the given address, or a
  * usage message when no address was supplied.
  */
 DB_SHOW_COMMAND(file, db_show_file)
 {
 
 	if (have_addr)
 		db_print_file((struct file *)addr, 1);
 	else
 		db_printf("usage: show file <addr>\n");
 }
 
 /*
  * DDB "show files": print every open file of every process.  The column
  * header is emitted only ahead of the first file printed.
  */
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header, n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW || (fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n <= fdp->fd_lastfile; n++) {
 			fp = fdp->fd_ofiles[n].fde_file;
 			if (fp != NULL) {
 				db_print_file(fp, header);
 				header = 0;
 			}
 		}
 	}
 }
 #endif
 
 /* Read-write: value of the maxfilesperproc variable. */
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 /* Read-write: value of the maxfiles variable. */
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 /* Read-only view of openfiles; __DEVOLATILE strips its volatile qualifier. */
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 
 /*
  * SYSINIT hook: initialize the sigio and fdesc mutexes and create the UMA
  * zones backing struct file and struct filedesc0.  The argument is unused.
  */
 /* ARGSUSED */
 static void
 filelistinit(void *dummy)
 {
 
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 	file_zone = uma_zcreate("Files", sizeof(struct file),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0",
 	    sizeof(struct filedesc0), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 /*
  * Read/write method of badfileops (installed as both fo_read and
  * fo_write): ignores its arguments and always fails with EBADF.
  */
 static int
 badfo_readwrite(struct file *fp, struct uio *uio,
     struct ucred *active_cred, int flags, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Truncate method of badfileops: ignores its arguments and always fails
  * with EINVAL.
  */
 static int
 badfo_truncate(struct file *fp, off_t length,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EINVAL);
 }
 
 /*
  * Ioctl method of badfileops: ignores its arguments and always fails with
  * EBADF.
  */
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Poll method of badfileops: ignores its arguments and always returns 0.
  */
 static int
 badfo_poll(struct file *fp, int events,
     struct ucred *active_cred, struct thread *td)
 {
 	return (0);
 }
 
 /*
  * Kqueue filter method of badfileops: always fails with EBADF.
  */
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 	return (EBADF);
 }
 
 /*
  * Stat method of badfileops: leaves 'sb' untouched and always fails with
  * EBADF.
  */
 static int
 badfo_stat(struct file *fp, struct stat *sb,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Close method of badfileops: always fails with EBADF.
  */
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Chmod method of badfileops: ignores its arguments and always fails with
  * EBADF.
  */
 static int
 badfo_chmod(struct file *fp, mode_t mode,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Chown method of badfileops: ignores its arguments and always fails with
  * EBADF.
  */
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Sendfile method of badfileops: ignores its arguments (and does not write
  * through 'sent') and always fails with EBADF.
  */
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent,
     int flags, int kflags, struct thread *td)
 {
 	return (EBADF);
 }
 
 /*
  * Fill-kinfo method of badfileops: leaves 'kif' untouched and returns 0.
  */
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp)
 {
 	return (0);
 }
 
 /*
  * File operations vector built entirely from the badfo_* stubs above.
  * Most methods fail with EBADF; truncate fails with EINVAL, and poll and
  * fill_kinfo return 0.
  */
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 /*
  * Exported read/write stub: ignores its arguments and always fails with
  * EOPNOTSUPP.
  */
 int
 invfo_rdwr(struct file *fp, struct uio *uio,
     struct ucred *active_cred, int flags, struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * Exported truncate stub: ignores its arguments and always fails with
  * EINVAL.
  */
 int
 invfo_truncate(struct file *fp, off_t length,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EINVAL);
 }
 
 /*
  * Exported ioctl stub: ignores its arguments and always fails with ENOTTY.
  */
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	return (ENOTTY);
 }
 
 /*
  * Exported poll stub: hands the requested events to poll_no_poll() and
  * returns its result unchanged.
  */
 int
 invfo_poll(struct file *fp, int events,
     struct ucred *active_cred, struct thread *td)
 {
 	return (poll_no_poll(events));
 }
 
 /*
  * Exported kqueue filter stub: always fails with EINVAL.
  */
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 	return (EINVAL);
 }
 
 /*
  * Exported chmod stub: ignores its arguments and always fails with EINVAL.
  */
 int
 invfo_chmod(struct file *fp, mode_t mode,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EINVAL);
 }
 
 /*
  * Exported chown stub: ignores its arguments and always fails with EINVAL.
  */
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid,
     struct ucred *active_cred, struct thread *td)
 {
 	return (EINVAL);
 }
 
 /*
  * Exported sendfile stub: ignores its arguments (and does not write
  * through 'sent') and always fails with EINVAL.
  */
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent,
     int flags, int kflags, struct thread *td)
 {
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /*
  * Open routine of the /dev/fd pseudo-device.
  *
  * XXX Kludge: record the unit number of the device (the descriptor the
  * caller wants duplicated) in td->td_dupfd and fail with ENODEV.  The
  * error return ensures that the vnode for this device will be released
  * by vn_open.  Open will detect this special error and take the actions
  * in dupfdopen.  Other callers of vn_open or VOP_OPEN will simply report
  * the error.
  */
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 	int unit;
 
 	unit = dev2unit(dev);
 	td->td_dupfd = unit;
 	return (ENODEV);
 }
 
 /*
  * Character device switch for the "FD" driver; fdopen() is the only
  * method it sets.
  */
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 /*
  * SYSINIT hook: create the fd/0, fd/1 and fd/2 device nodes (root:wheel,
  * mode 0666) and give them the aliases stdin, stdout and stderr.
  */
 static void
 fildesc_drvinit(void *unused)
 {
 	static const struct {
 		int		unit;
 		const char	*name;
 		const char	*alias;
 	} stdfds[] = {
 		{ 0, "fd/0", "stdin" },
 		{ 1, "fd/1", "stdout" },
 		{ 2, "fd/2", "stderr" },
 	};
 	struct cdev *dev;
 	size_t i;
 
 	for (i = 0; i < sizeof(stdfds) / sizeof(stdfds[0]); i++) {
 		dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw,
 		    stdfds[i].unit, NULL, UID_ROOT, GID_WHEEL, 0666, "%s",
 		    stdfds[i].name);
 		make_dev_alias(dev, "%s", stdfds[i].alias);
 	}
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Index: projects/ino64/sys/kern/kern_proc.c
===================================================================
--- projects/ino64/sys/kern/kern_proc.c	(revision 276110)
+++ projects/ino64/sys/kern/kern_proc.c	(revision 276111)
@@ -1,3037 +1,3048 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/elf.h>
 #include <sys/exec.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/stack.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/filedesc.h>
 #include <sys/tty.h>
 #include <sys/signalvar.h>
 #include <sys/sdt.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/jail.h>
 #include <sys/vnode.h>
 #include <sys/eventhandler.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 /*
  * SDT provider "proc" and the entry/return probes that proc_ctor(),
  * proc_dtor() and proc_init() fire below.  The string arguments name the
  * C types of the probe arguments.
  */
 SDT_PROVIDER_DEFINE(proc);
 SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, "struct proc *", "int",
     "void *", "int");
 SDT_PROBE_DEFINE4(proc, kernel, ctor, return, "struct proc *", "int",
     "void *", "int");
 SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, "struct proc *", "int",
     "void *", "struct thread *");
 SDT_PROBE_DEFINE3(proc, kernel, dtor, return, "struct proc *", "int",
     "void *");
 SDT_PROBE_DEFINE3(proc, kernel, init, entry, "struct proc *", "int",
     "int");
 SDT_PROBE_DEFINE3(proc, kernel, init, return, "struct proc *", "int",
     "int");
 
 /*
  * Malloc types used in this file.  M_PGRP and M_SESSION tag the frees in
  * pgdelete() and sess_release(); M_PROC tags the hash tables allocated in
  * procinit() and is the only one private to this file.
  */
 MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
 MALLOC_DEFINE(M_SESSION, "session", "session header");
 static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
 MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
 
 /* Forward declarations of the file-local helpers. */
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
 static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
     int preferthread);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 /* UMA zone callbacks passed to uma_zcreate() in procinit(). */
 static int proc_ctor(void *mem, int size, void *arg, int flags);
 static void proc_dtor(void *mem, int size, void *arg);
 static int proc_init(void *mem, int size, int flags);
 static void proc_fini(void *mem, int size);
 static void pargs_free(struct pargs *pa);
 static struct proc *zpfind_locked(pid_t pid);
 
 /*
  * Other process lists
  *
  * Global process bookkeeping; all of the objects below are set up by
  * procinit().  pidhash and pgrphash are filled in by hashinit() alongside
  * the corresponding hash tables.
  */
 struct pidhashhead *pidhashtbl;
 u_long pidhash;
 struct pgrphashhead *pgrphashtbl;
 u_long pgrphash;
 struct proclist allproc;
 struct proclist zombproc;
 struct sx allproc_lock;
 struct sx proctree_lock;
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;
 
 /* Initialized from KSTACK_PAGES and exported read-only via sysctl. */
 int kstack_pages = KSTACK_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
     "Kernel stack size in pages");
 /* Read-write knob; not consulted anywhere in the code visible here. */
 static int vmmap_skip_res_cnt = 0;
 SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW,
     &vmmap_skip_res_cnt, 0,
     "Skip calculation of the pages resident count in kern.proc.vmmap");
 
 /*
  * Compile-time checks that the kinfo_proc structures keep the sizes
  * recorded in KINFO_PROC_SIZE / KINFO_PROC32_SIZE.
  */
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 #ifdef COMPAT_FREEBSD32
 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
 #endif
 
 /*
  * Initialize global process hashing structures.
  *
  * Sets up the allproc/proctree/p_peers locks, the allproc and zombproc
  * lists, the pid and process group hash tables (sized from maxproc / 4),
  * the UMA zone for struct proc, and finally calls uihashinit().
  */
 void
 procinit(void)
 {
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
 	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
 	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	/* UMA_ZONE_NOFREE: proc_fini() panics if a proc is ever reclaimed. */
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
 /*
  * UMA constructor for struct proc: fires the ctor entry/return probes
  * around the process_ctor event handlers.  Always returns 0.
  */
 static int
 proc_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct proc *p = mem;
 
 	SDT_PROBE(proc, kernel, ctor, entry, p, size, arg, flags, 0);
 	EVENTHANDLER_INVOKE(process_ctor, p);
 	SDT_PROBE(proc, kernel, ctor, return, p, size, arg, flags, 0);
 	return (0);
 }
 
 /*
  * UMA destructor for struct proc: releases the OSD of the first thread
  * (if the proc still has one), runs the process_dtor event handlers and
  * asserts that the SIGCHLD ksiginfo, when present, is not still queued.
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
 {
 	struct proc *p = mem;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);
 
 	/* INVARIANTS checks go here */
 	SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0);
 	if (td != NULL) {
 #ifdef INVARIANTS
 		KASSERT((p->p_numthreads == 1),
 		    ("bad number of threads in exiting process"));
 		KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
 #endif
 		/* Free all OSD associated to this thread. */
 		osd_thread_exit(td);
 	}
 	EVENTHANDLER_INVOKE(process_dtor, p);
 	KASSERT(p->p_ksi == NULL || !KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
 	SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
 }
 
 /*
  * UMA init routine: set up the type-stable parts of a newly created proc
  * (scheduler area pointer, mutexes, condition variables, thread list and
  * stats), running the process_init event handlers.  Always returns 0.
  */
 static int
 proc_init(void *mem, int size, int flags)
 {
 	struct proc *p = mem;
 
 	SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
 	/* The scheduler-private area starts right after struct proc. */
 	p->p_sched = (struct p_sched *)(p + 1);
 	bzero(&p->p_mtx, sizeof(p->p_mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN);
 	mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN);
 	mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN);
 	mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN);
 	cv_init(&p->p_pwait, "ppwait");
 	cv_init(&p->p_dbgwait, "dbgwait");
 	/* All threads in the proc hang off this list. */
 	TAILQ_INIT(&p->p_threads);
 	EVENTHANDLER_INVOKE(process_init, p);
 	p->p_stats = pstats_alloc();
 	SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
 	return (0);
 }
 
 /*
  * UMA fini routine.  UMA should ensure that this function is never
  * called: freeing a proc structure would violate type stability, so the
  * normal build panics.  The teardown code is kept under "notnow".
  */
 static void
 proc_fini(void *mem, int size)
 {
 #ifndef notnow
 	panic("proc reclaimed");
 #else
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	EVENTHANDLER_INVOKE(process_fini, p);
 	pstats_free(p->p_stats);
 	thread_free(FIRST_THREAD_IN_PROC(p));
 	mtx_destroy(&p->p_mtx);
 	if (p->p_ksi != NULL)
 		ksiginfo_free(p->p_ksi);
 #endif
 }
 
 /*
  * Is p an inferior of the current process?
  *
  * Walks up the chain of real parents starting at p.  Returns 1 if curproc
  * is reached and 0 if the walk reaches pid 0 first.  proctree_lock and the
  * lock of p must be held.
  */
 int
 inferior(struct proc *p)
 {
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	while (p != curproc) {
 		if (p->p_pid == 0)
 			return (0);
 		p = proc_realparent(p);
 	}
 	return (1);
 }
 
 /*
  * Look up a process by pid in the pid hash; allproc_lock must be held.
  * On success the process is returned locked.  Returns NULL if there is no
  * match or the matching process is still in PRS_NEW state.
  */
 struct proc *
 pfind_locked(pid_t pid)
 {
 	struct proc *p;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash) {
 		if (p->p_pid != pid)
 			continue;
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			return (NULL);
 		}
 		return (p);
 	}
 	return (NULL);
 }
 
 /*
  * Locate a process by number; return only "live" processes -- i.e., neither
  * zombies nor newly born but incompletely initialized processes.  By not
  * returning processes in the PRS_NEW state, we allow callers to avoid
  * testing for that condition to avoid dereferencing p_ucred, et al.
  *
  * This is pfind_locked() wrapped in a shared hold of allproc_lock, so a
  * process that is found is returned locked.
  */
 struct proc *
 pfind(pid_t pid)
 {
 	struct proc *found;
 
 	sx_slock(&allproc_lock);
 	found = pfind_locked(pid);
 	sx_sunlock(&allproc_lock);
 	return (found);
 }
 
 /*
  * Find the process owning the thread with the given tid by scanning all
  * processes; allproc_lock must be held.  The owning process is returned
  * locked.  Processes in PRS_NEW state are skipped.  Returns NULL when no
  * thread matches.
  */
 static struct proc *
 pfind_tid_locked(pid_t tid)
 {
 	struct proc *p;
 	struct thread *td;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state != PRS_NEW) {
 			FOREACH_THREAD_IN_PROC(p, td) {
 				if (td->td_tid == tid)
 					return (p);
 			}
 		}
 		PROC_UNLOCK(p);
 	}
 	return (NULL);
 }
 
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
  *
  * Returns the matching process group with its lock held (PGRP_LOCK), or
  * NULL if no group with that id is in the hash.
  */
 struct pgrp *
 pgfind(pid_t pgid)
 {
 	struct pgrp *pgrp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
 		if (pgrp->pg_id == pgid) {
 			PGRP_LOCK(pgrp);
 			return (pgrp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Locate process and do additional manipulations, depending on flags.
  *
  * Values above PID_MAX are looked up as thread ids unless PGET_NOTID is
  * set.  Returns 0 and stores the process in *pp, or an errno value with
  * *pp untouched.  With PGET_HOLD the process is held and returned
  * unlocked; otherwise it is returned still locked.
  */
 int
 pget(pid_t pid, int flags, struct proc **pp)
 {
 	struct proc *p;
 	int error;
 
 	p = NULL;
 	sx_slock(&allproc_lock);
 	if (pid <= PID_MAX) {
 		p = pfind_locked(pid);
 		if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
 			p = zpfind_locked(pid);
 	} else if ((flags & PGET_NOTID) == 0)
 		p = pfind_tid_locked(pid);
 	sx_sunlock(&allproc_lock);
 	if (p == NULL)
 		return (ESRCH);
 
 	/* Run the requested checks in order; the first failure wins. */
 	error = 0;
 	if ((flags & PGET_CANSEE) != 0)
 		error = p_cansee(curthread, p);
 	if (error == 0 && (flags & PGET_CANDEBUG) != 0)
 		error = p_candebug(curthread, p);
 	if (error == 0 && (flags & PGET_ISCURRENT) != 0 && curproc != p)
 		error = EPERM;
 	if (error == 0 && (flags & PGET_NOTWEXIT) != 0 &&
 	    (p->p_flag & P_WEXIT) != 0)
 		error = ESRCH;
 	/*
 	 * XXXRW: Not clear ESRCH is the right error during proc
 	 * execve().
 	 */
 	if (error == 0 && (flags & PGET_NOTINEXEC) != 0 &&
 	    (p->p_flag & P_INEXEC) != 0)
 		error = ESRCH;
 	if (error != 0) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	if ((flags & PGET_HOLD) != 0) {
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 	}
 	*pp = p;
 	return (0);
 }
 
 /*
  * Create a new process group.
  * pgid must be equal to the pid of p.
  * Begin a new session if required.
  *
  * The caller holds proctree_lock exclusively and supplies the storage for
  * the new pgrp, and for the new session when sess is not NULL.  With a
  * NULL sess the new group joins p's current session and takes a reference
  * on it.  Always returns 0.
  */
 int
 enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
 	KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
 	KASSERT(p->p_pid == pgid,
 	    ("enterpgrp: new pgrp and pid != pgid"));
 	KASSERT(pgfind(pgid) == NULL,
 	    ("enterpgrp: pgrp with pgid exists"));
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
 	mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 
 	if (sess != NULL) {
 		/*
 		 * new session
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
 		PGRP_LOCK(pgrp);
 		sess->s_leader = p;
 		sess->s_sid = p->p_pid;
 		refcount_init(&sess->s_count, 1);
 		sess->s_ttyvp = NULL;
 		sess->s_ttydp = NULL;
 		sess->s_ttyp = NULL;
 		/* The login name is inherited from the old session. */
 		bcopy(p->p_session->s_login, sess->s_login,
 			    sizeof(sess->s_login));
 		pgrp->pg_session = sess;
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
 		pgrp->pg_session = p->p_session;
 		sess_hold(pgrp->pg_session);
 		PGRP_LOCK(pgrp);
 	}
 	pgrp->pg_id = pgid;
 	LIST_INIT(&pgrp->pg_members);
 
 	/*
 	 * As we have an exclusive lock of proctree_lock,
 	 * this should not deadlock.
 	 */
 	LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to an existing process group
  *
  * The caller holds proctree_lock exclusively and none of the process,
  * process group or session locks.  pgrp must belong to p's session and
  * must differ from p's current group.  Always returns 0.
  */
 int
 enterthispgrp(struct proc *p, struct pgrp *pgrp)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 	KASSERT(pgrp->pg_session == p->p_session,
 		("%s: pgrp's session %p, p->p_session %p.\n",
 		__func__,
 		pgrp->pg_session,
 		p->p_session));
 	KASSERT(pgrp != p->p_pgrp,
 		("%s: p belongs to pgrp.", __func__));
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to a process group
  *
  * Common tail of enterpgrp() and enterthispgrp(): fixes up the job
  * control counts, relinks p from its old group onto pgrp, and deletes the
  * old group if that left it without members.  The caller holds
  * proctree_lock exclusively and none of the locks asserted below.
  */
 static void
 doenterpgrp(struct proc *p, struct pgrp *pgrp)
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 
 	savepgrp = p->p_pgrp;
 
 	/*
 	 * Adjust eligibility of affected pgrps to participate in job control.
 	 * Increment eligibility counts before decrementing, otherwise we
 	 * could reach 0 spuriously during the first call.
 	 */
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = pgrp;
 	PROC_UNLOCK(p);
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
 
 /*
  * remove process from process group
  *
  * Unlinks p from its group, clears p->p_pgrp, and deletes the group if it
  * is left without members.  The caller holds proctree_lock exclusively.
  * Always returns 0.
  */
 int
 leavepgrp(struct proc *p)
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	savepgrp = p->p_pgrp;
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = NULL;
 	PROC_UNLOCK(p);
 	PGRP_UNLOCK(savepgrp);
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 	return (0);
 }
 
 /*
  * delete a process group
  *
  * Detaches sigio owners, removes the group from the hash, hands the group
  * to tty_rel_pgrp() if its session has a tty, then frees the group and
  * drops its session reference.  The caller holds proctree_lock
  * exclusively and neither the group lock nor the session lock.
  */
 static void
 pgdelete(struct pgrp *pgrp)
 {
 	struct session *savesess;
 	struct tty *tp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pgid.
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
 	PGRP_LOCK(pgrp);
 	tp = pgrp->pg_session->s_ttyp;
 	LIST_REMOVE(pgrp, pg_hash);
 	savesess = pgrp->pg_session;
 	PGRP_UNLOCK(pgrp);
 
 	/* Remove the reference to the pgrp before deallocating it. */
 	if (tp != NULL) {
 		/* NOTE(review): no tty unlock here; presumably done in tty_rel_pgrp(). */
 		tty_lock(tp);
 		tty_rel_pgrp(tp, pgrp);
 	}
 
 	mtx_destroy(&pgrp->pg_mtx);
 	free(pgrp, M_PGRP);
 	sess_release(savesess);
 }
 
 /*
  * Adjust the job control count of pgrp under the group lock: increment it
  * when 'entering' is non-zero, otherwise decrement it and call orphanpg()
  * if the count drops to zero.
  */
 static void
 pgadjustjobc(struct pgrp *pgrp, int entering)
 {
 
 	PGRP_LOCK(pgrp);
 	if (entering)
 		pgrp->pg_jobc++;
 	else {
 		--pgrp->pg_jobc;
 		if (pgrp->pg_jobc == 0)
 			orphanpg(pgrp);
 	}
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Adjust pgrp jobc counters when specified process changes process group.
  * We count the number of processes in each process group that "qualify"
  * the group for terminal job control (those with a parent in a different
  * process group of the same session).  If that count reaches zero, the
  * process group becomes orphaned.  Check both the specified process'
  * process group and that of its children.
  * entering == 0 => p is leaving specified group.
  * entering == 1 => p is entering specified group.
  *
  * The caller holds proctree_lock and none of the process, group or
  * session locks asserted below.
  */
 void
 fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
 {
 	struct pgrp *hispgrp;
 	struct session *mysession;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Check p's parent to see whether p qualifies its own process
 	 * group; if so, adjust count for p's process group.
 	 */
 	mysession = pgrp->pg_session;
 	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
 	    hispgrp->pg_session == mysession)
 		pgadjustjobc(pgrp, entering);
 
 	/*
 	 * Check this process' children to see whether they qualify
 	 * their process groups; if so, adjust counts for children's
 	 * process groups.  Note that p is reused as the iterator here.
 	 */
 	LIST_FOREACH(p, &p->p_children, p_sibling) {
 		hispgrp = p->p_pgrp;
 		if (hispgrp == pgrp ||
 		    hispgrp->pg_session != mysession)
 			continue;
 		/* Zombie children are skipped. */
 		PROC_LOCK(p);
 		if (p->p_state == PRS_ZOMBIE) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		PROC_UNLOCK(p);
 		pgadjustjobc(hispgrp, entering);
 	}
 }
 
 /*
  * A process group has become orphaned;
  * if there are any stopped processes in the group,
  * hang-up all process in that group.
  *
  * Must be called with the group lock held.  As soon as one member is
  * found for which P_SHOULDSTOP() is true, every member of the group is
  * sent SIGHUP followed by SIGCONT.
  */
 static void
 orphanpg(struct pgrp *pg)
 {
 	struct proc *p;
 
 	PGRP_LOCK_ASSERT(pg, MA_OWNED);
 
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		if (P_SHOULDSTOP(p)) {
 			PROC_UNLOCK(p);
 			/* Reusing p is safe: we return after this pass. */
 			LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 				PROC_LOCK(p);
 				kern_psignal(p, SIGHUP);
 				kern_psignal(p, SIGCONT);
 				PROC_UNLOCK(p);
 			}
 			return;
 		}
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Take an additional reference on session 's'.
  */
 void
 sess_hold(struct session *s)
 {
 	refcount_acquire(&s->s_count);
 }
 
 /*
  * Drop a reference on session 's'.  When the last reference goes away the
  * session is handed to tty_rel_sess() if it has a tty, its mutex is
  * destroyed and the structure is freed.
  */
 void
 sess_release(struct session *s)
 {
 
 	if (!refcount_release(&s->s_count))
 		return;
 	if (s->s_ttyp != NULL) {
 		tty_lock(s->s_ttyp);
 		tty_rel_sess(s->s_ttyp, s);
 	}
 	mtx_destroy(&s->s_mtx);
 	free(s, M_SESSION);
 }
 
 #ifdef DDB
 
 /*
  * DDB "show pgrpdump": walk every bucket of the process group hash table
  * and print each process group together with its member processes.
  * Output goes through db_printf(), like the other DDB commands, rather
  * than the general kernel printf().
  */
 DB_SHOW_COMMAND(pgrpdump, pgrpdump)
 {
 	struct pgrp *pgrp;
 	struct proc *p;
 	int i;
 
 	for (i = 0; i <= pgrphash; i++) {
 		if (!LIST_EMPTY(&pgrphashtbl[i])) {
 			db_printf("\tindx %d\n", i);
 			LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
 				db_printf(
 			"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
 				    (void *)pgrp, (long)pgrp->pg_id,
 				    (void *)pgrp->pg_session,
 				    pgrp->pg_session->s_count,
 				    (void *)LIST_FIRST(&pgrp->pg_members));
 				LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 					db_printf("\t\tpid %ld addr %p pgrp %p\n",
 					    (long)p->p_pid, (void *)p,
 					    (void *)p->p_pgrp);
 				}
 			}
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Calculate the kinfo_proc members which contain process-wide
  * information: ki_pctcpu and ki_estcpu are reset and then summed over
  * all threads of the process, taking each thread's lock in turn.
  * Must be called with the target process locked.
  */
 static void
 fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	kp->ki_pctcpu = 0;
 	kp->ki_estcpu = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		kp->ki_estcpu += td->td_estcpu;
 		kp->ki_pctcpu += sched_pctcpu(td);
 		thread_unlock(td);
 	}
 }
 
 /*
  * Clear kinfo_proc and fill in any information that is common
  * to all threads in the process.
  * Must be called with the target process locked and with proctree_lock
  * held (the latter for proc_realparent()).
  */
 static void
 fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *td0;
 	struct tty *tp;
 	struct session *sp;
 	struct ucred *cred;
 	struct sigacts *ps;
 
 	/* For proc_realparent. */
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	bzero(kp, sizeof(*kp));
 
 	kp->ki_structsize = sizeof(*kp);
 	kp->ki_paddr = p;
 	kp->ki_addr =/* p->p_addr; */0; /* XXX */
 	kp->ki_args = p->p_args;
 	kp->ki_textvp = p->p_textvp;
 #ifdef KTRACE
 	kp->ki_tracep = p->p_tracevp;
 	kp->ki_traceflag = p->p_traceflag;
 #endif
 	kp->ki_fd = p->p_fd;
 	kp->ki_vmspace = p->p_vmspace;
 	kp->ki_flag = p->p_flag;
 	kp->ki_flag2 = p->p_flag2;
 	/* Credentials: ids, capability-mode flag, groups and jail. */
 	cred = p->p_ucred;
 	if (cred) {
 		kp->ki_uid = cred->cr_uid;
 		kp->ki_ruid = cred->cr_ruid;
 		kp->ki_svuid = cred->cr_svuid;
 		kp->ki_cr_flags = 0;
 		if (cred->cr_flags & CRED_FLAG_CAPMODE)
 			kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
 		/* XXX bde doesn't like KI_NGROUPS */
 		if (cred->cr_ngroups > KI_NGROUPS) {
 			kp->ki_ngroups = KI_NGROUPS;
 			kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
 		} else
 			kp->ki_ngroups = cred->cr_ngroups;
 		bcopy(cred->cr_groups, kp->ki_groups,
 		    kp->ki_ngroups * sizeof(gid_t));
 		kp->ki_rgid = cred->cr_rgid;
 		kp->ki_svgid = cred->cr_svgid;
 		/* If jailed(cred), emulate the old P_JAILED flag. */
 		if (jailed(cred)) {
 			kp->ki_flag |= P_JAILED;
 			/* If inside the jail, use 0 as a jail ID. */
 			if (cred->cr_prison != curthread->td_ucred->cr_prison)
 				kp->ki_jid = cred->cr_prison->pr_id;
 		}
 		strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
 		    sizeof(kp->ki_loginclass));
 	}
 	/* Signal dispositions, copied under the sigacts mutex. */
 	ps = p->p_sigacts;
 	if (ps) {
 		mtx_lock(&ps->ps_mtx);
 		kp->ki_sigignore = ps->ps_sigignore;
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/* Address space sizes; skipped for new, zombie or vmspace-less procs. */
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
 		struct vmspace *vm = p->p_vmspace;
 
 		kp->ki_size = vm->vm_map.size;
 		kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
 		FOREACH_THREAD_IN_PROC(p, td0) {
 			if (!TD_IS_SWAPPED(td0))
 				kp->ki_rssize += td0->td_kstack_pages;
 		}
 		kp->ki_swrss = vm->vm_swrss;
 		kp->ki_tsize = vm->vm_tsize;
 		kp->ki_dsize = vm->vm_dsize;
 		kp->ki_ssize = vm->vm_ssize;
 	} else if (p->p_state == PRS_ZOMBIE)
 		kp->ki_stat = SZOMB;
 	if (kp->ki_flag & P_INMEM)
 		kp->ki_sflag = PS_INMEM;
 	else
 		kp->ki_sflag = 0;
 	/* Calculate legacy swtime as seconds since 'swtick'. */
 	kp->ki_swtime = (ticks - p->p_swtick) / hz;
 	kp->ki_pid = p->p_pid;
 	kp->ki_nice = p->p_nice;
 	kp->ki_fibnum = p->p_fibnum;
 	/* p_start has boottime added to it before being reported. */
 	kp->ki_start = p->p_stats->p_start;
 	timevaladd(&kp->ki_start, &boottime);
 	PROC_STATLOCK(p);
 	rufetch(p, &kp->ki_rusage);
 	kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
 	calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
 	PROC_STATUNLOCK(p);
 	calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 	/* Some callers want child times in a single value. */
 	kp->ki_childtime = kp->ki_childstime;
 	timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 
 	FOREACH_THREAD_IN_PROC(p, td0)
 		kp->ki_cow += td0->td_cow;
 
 	/* Process group, session and controlling terminal. */
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
 		kp->ki_jobc = p->p_pgrp->pg_jobc;
 		sp = p->p_pgrp->pg_session;
 
 		if (sp != NULL) {
 			kp->ki_sid = sp->s_sid;
 			SESS_LOCK(sp);
 			strlcpy(kp->ki_login, sp->s_login,
 			    sizeof(kp->ki_login));
 			if (sp->s_ttyvp)
 				kp->ki_kiflag |= KI_CTTY;
 			if (SESS_LEADER(p))
 				kp->ki_kiflag |= KI_SLEADER;
 			/* XXX proctree_lock */
 			tp = sp->s_ttyp;
 			SESS_UNLOCK(sp);
 		}
 	}
 	if ((p->p_flag & P_CONTROLT) && tp != NULL) {
 		kp->ki_tdev = tty_udev(tp);
+		kp->ki_tdev_freebsd10 = kp->ki_tdev; /* truncate */
 		kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
 		if (tp->t_session)
 			kp->ki_tsid = tp->t_session->s_sid;
-	} else
+	} else {
 		kp->ki_tdev = NODEV;
+		kp->ki_tdev_freebsd10 = kp->ki_tdev; /* truncate */
+	}
 	if (p->p_comm[0] != '\0')
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
 	if (p->p_sysent && p->p_sysent->sv_name != NULL &&
 	    p->p_sysent->sv_name[0] != '\0')
 		strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
 	kp->ki_siglist = p->p_siglist;
 	kp->ki_xstat = p->p_xstat;
 	kp->ki_acflag = p->p_acflag;
 	kp->ki_lock = p->p_lock;
 	/* ki_ppid is the real parent; ki_tracer is p_pptr while traced. */
 	if (p->p_pptr) {
 		kp->ki_ppid = proc_realparent(p)->p_pid;
 		if (p->p_flag & P_TRACED)
 			kp->ki_tracer = p->p_pptr->p_pid;
 	}
 }
 
 /*
  * Fill in information that is thread specific.  Must be called with
  * target process locked.  If 'preferthread' is set, overwrite certain
  * process-related fields that are maintained for both threads and
  * processes.
  *
  * td:           thread whose state is exported.
  * kp:           kinfo_proc record being filled in; only the thread-related
  *               fields are written here.
  * preferthread: non-zero to replace the rusage, runtime, pctcpu, estcpu,
  *               cow and siglist fields with this thread's own values.
  */
 static void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	kp->ki_tdaddr = td;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * The stat lock is only taken when the per-thread statistics are
 	 * fetched further down; it is acquired before the thread lock and
 	 * released after it.
 	 */
 	if (preferthread)
 		PROC_STATLOCK(p);
 	thread_lock(td);
 	/* Export the wait message, or clear whatever kp held before. */
 	if (td->td_wmesg != NULL)
 		strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
 	else
 		bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
 	strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
 	/* The lock name is only meaningful while the thread is on a lock. */
 	if (TD_ON_LOCK(td)) {
 		kp->ki_kiflag |= KI_LOCKBLOCK;
 		strlcpy(kp->ki_lockname, td->td_lockname,
 		    sizeof(kp->ki_lockname));
 	} else {
 		kp->ki_kiflag &= ~KI_LOCKBLOCK;
 		bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
 	}
 
 	/* Derive the exported state from the process and thread state. */
 	if (p->p_state == PRS_NORMAL) { /* approximate. */
 		if (TD_ON_RUNQ(td) ||
 		    TD_CAN_RUN(td) ||
 		    TD_IS_RUNNING(td)) {
 			kp->ki_stat = SRUN;
 		} else if (P_SHOULDSTOP(p)) {
 			kp->ki_stat = SSTOP;
 		} else if (TD_IS_SLEEPING(td)) {
 			kp->ki_stat = SSLEEP;
 		} else if (TD_ON_LOCK(td)) {
 			kp->ki_stat = SLOCK;
 		} else {
 			kp->ki_stat = SWAIT;
 		}
 	} else if (p->p_state == PRS_ZOMBIE) {
 		kp->ki_stat = SZOMB;
 	} else {
 		kp->ki_stat = SIDL;
 	}
 
 	/* Things in the thread */
 	kp->ki_wchan = td->td_wchan;
 	kp->ki_pri.pri_level = td->td_priority;
 	kp->ki_pri.pri_native = td->td_base_pri;
 
 	/*
 	 * Note: legacy fields; clamp at the old NOCPU value and/or
 	 * the maximum u_char CPU value.
 	 */
 	if (td->td_lastcpu == NOCPU)
 		kp->ki_lastcpu_old = NOCPU_OLD;
 	else if (td->td_lastcpu > MAXCPU_OLD)
 		kp->ki_lastcpu_old = MAXCPU_OLD;
 	else
 		kp->ki_lastcpu_old = td->td_lastcpu;
 
 	if (td->td_oncpu == NOCPU)
 		kp->ki_oncpu_old = NOCPU_OLD;
 	else if (td->td_oncpu > MAXCPU_OLD)
 		kp->ki_oncpu_old = MAXCPU_OLD;
 	else
 		kp->ki_oncpu_old = td->td_oncpu;
 
 	/* The non-legacy CPU fields carry the values unclamped. */
 	kp->ki_lastcpu = td->td_lastcpu;
 	kp->ki_oncpu = td->td_oncpu;
 	kp->ki_tdflags = td->td_flags;
 	kp->ki_tid = td->td_tid;
 	kp->ki_numthreads = p->p_numthreads;
 	kp->ki_pcb = td->td_pcb;
 	kp->ki_kstack = (void *)td->td_kstack;
 	/* Ticks elapsed since td_slptick, scaled down by hz. */
 	kp->ki_slptime = (ticks - td->td_slptick) / hz;
 	kp->ki_pri.pri_class = td->td_pri_class;
 	kp->ki_pri.pri_user = td->td_user_pri;
 
 	/* Per-thread view requested: replace the process-wide values. */
 	if (preferthread) {
 		rufetchtd(td, &kp->ki_rusage);
 		kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
 		kp->ki_pctcpu = sched_pctcpu(td);
 		kp->ki_estcpu = td->td_estcpu;
 		kp->ki_cow = td->td_cow;
 	}
 
 	/* We can't get this anymore but ps etc never used it anyway. */
 	kp->ki_rqindex = 0;
 
 	if (preferthread)
 		kp->ki_siglist = td->td_siglist;
 	kp->ki_sigmask = td->td_sigmask;
 	thread_unlock(td);
 	if (preferthread)
 		PROC_STATUNLOCK(p);
 }
 
 /*
  * Build a complete kinfo_proc record for process 'p' in 'kp'.
  * The caller must hold the process lock and the process must have at
  * least one thread.  The record is assembled from the process-only
  * data, the first thread's data (without thread preference), and the
  * aggregated data, in that order.
  */
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *first;
 
 	first = FIRST_THREAD_IN_PROC(p);
 	MPASS(first != NULL);
 
 	fill_kinfo_proc_only(p, kp);
 	fill_kinfo_thread(first, kp, 0);
 	fill_kinfo_aggregate(p, kp);
 }
 
 /*
  * Allocate a zero-filled struct pstats from the M_SUBPROC malloc type.
  * The allocation is made with M_WAITOK.
  */
 struct pstats *
 pstats_alloc(void)
 {
 	struct pstats *ps;
 
 	ps = malloc(sizeof(*ps), M_SUBPROC, M_WAITOK | M_ZERO);
 	return (ps);
 }
 
 /*
  * Copy parts of p_stats; zero the rest of p_stats (statistics).
  */
 void
 pstats_fork(struct pstats *src, struct pstats *dst)
 {
 
 	bzero(&dst->pstat_startzero,
 	    __rangeof(struct pstats, pstat_startzero, pstat_endzero));
 	bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
 	    __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
 }
 
 /*
  * Release a struct pstats back to the M_SUBPROC malloc type.
  */
 void
 pstats_free(struct pstats *ps)
 {
 
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Search the zombie list for a process with the given pid.  The caller
  * must hold allproc_lock.  On a match the process is returned with its
  * process lock held; NULL is returned when no zombie has that pid.
  */
 static struct proc *
 zpfind_locked(pid_t pid)
 {
 	struct proc *zp;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	LIST_FOREACH(zp, &zombproc, p_list) {
 		if (zp->p_pid != pid)
 			continue;
 		PROC_LOCK(zp);
 		return (zp);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a zombie process by pid.  Takes allproc_lock shared around
  * the search; the result (locked process or NULL) is whatever
  * zpfind_locked() returned.
  */
 struct proc *
 zpfind(pid_t pid)
 {
 	struct proc *zp;
 
 	sx_slock(&allproc_lock);
 	zp = zpfind_locked(pid);
 	sx_sunlock(&allproc_lock);
 
 	return (zp);
 }
 
 #ifdef COMPAT_FREEBSD32
 
 /*
  * Squeeze a pointer into 32 bits for a 32-bit consumer.  A value that
  * does not fit (greater than UINT_MAX) is reported as zero instead of
  * being truncated.  The callers typically export kernel addresses, so
  * substituting zero is acceptable.
  */
 static inline uint32_t
 ptr32_trim(void *ptr)
 {
 	uintptr_t addr = (uintptr_t)ptr;
 
 	if (addr > UINT_MAX)
 		return (0);
 	return (addr);
 }
 
 /*
  * Copy pointer field 'fld' from 'src' to the 32-bit field of the same
  * name in 'dst', passing it through ptr32_trim().
  */
 #define PTRTRIM_CP(src,dst,fld) \
 	do { (dst).fld = ptr32_trim((src).fld); } while (0)
 
 /*
  * Convert a native kinfo_proc record into the kinfo_proc32 layout.
  *
  * The destination is zeroed first, so any field that is not explicitly
  * copied below reads as zero.  ki_structsize is set to the size of the
  * 32-bit structure rather than copied.  Pointer fields go through
  * PTRTRIM_CP, timevals through TV_CP, rusage through
  * freebsd32_rusage_out(), and plain scalars through CP.
  */
 static void
 freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
 {
 	int i;
 
 	bzero(ki32, sizeof(struct kinfo_proc32));
 	ki32->ki_structsize = sizeof(struct kinfo_proc32);
 	CP(*ki, *ki32, ki_layout);
 	/* Pointer-valued fields. */
 	PTRTRIM_CP(*ki, *ki32, ki_args);
 	PTRTRIM_CP(*ki, *ki32, ki_paddr);
 	PTRTRIM_CP(*ki, *ki32, ki_addr);
 	PTRTRIM_CP(*ki, *ki32, ki_tracep);
 	PTRTRIM_CP(*ki, *ki32, ki_textvp);
 	PTRTRIM_CP(*ki, *ki32, ki_fd);
 	PTRTRIM_CP(*ki, *ki32, ki_vmspace);
 	PTRTRIM_CP(*ki, *ki32, ki_wchan);
 	CP(*ki, *ki32, ki_pid);
 	CP(*ki, *ki32, ki_ppid);
 	CP(*ki, *ki32, ki_pgid);
 	CP(*ki, *ki32, ki_tpgid);
 	CP(*ki, *ki32, ki_sid);
 	CP(*ki, *ki32, ki_tsid);
 	CP(*ki, *ki32, ki_jobc);
 	CP(*ki, *ki32, ki_tdev);
+	CP(*ki, *ki32, ki_tdev_freebsd10);
 	CP(*ki, *ki32, ki_siglist);
 	CP(*ki, *ki32, ki_sigmask);
 	CP(*ki, *ki32, ki_sigignore);
 	CP(*ki, *ki32, ki_sigcatch);
 	CP(*ki, *ki32, ki_uid);
 	CP(*ki, *ki32, ki_ruid);
 	CP(*ki, *ki32, ki_svuid);
 	CP(*ki, *ki32, ki_rgid);
 	CP(*ki, *ki32, ki_svgid);
 	CP(*ki, *ki32, ki_ngroups);
 	/* All KI_NGROUPS slots are copied, regardless of ki_ngroups. */
 	for (i = 0; i < KI_NGROUPS; i++)
 		CP(*ki, *ki32, ki_groups[i]);
 	CP(*ki, *ki32, ki_size);
 	CP(*ki, *ki32, ki_rssize);
 	CP(*ki, *ki32, ki_swrss);
 	CP(*ki, *ki32, ki_tsize);
 	CP(*ki, *ki32, ki_dsize);
 	CP(*ki, *ki32, ki_ssize);
 	CP(*ki, *ki32, ki_xstat);
 	CP(*ki, *ki32, ki_acflag);
 	CP(*ki, *ki32, ki_pctcpu);
 	CP(*ki, *ki32, ki_estcpu);
 	CP(*ki, *ki32, ki_slptime);
 	CP(*ki, *ki32, ki_swtime);
 	CP(*ki, *ki32, ki_cow);
 	CP(*ki, *ki32, ki_runtime);
 	TV_CP(*ki, *ki32, ki_start);
 	TV_CP(*ki, *ki32, ki_childtime);
 	CP(*ki, *ki32, ki_flag);
 	CP(*ki, *ki32, ki_kiflag);
 	CP(*ki, *ki32, ki_traceflag);
 	CP(*ki, *ki32, ki_stat);
 	CP(*ki, *ki32, ki_nice);
 	CP(*ki, *ki32, ki_lock);
 	CP(*ki, *ki32, ki_rqindex);
 	CP(*ki, *ki32, ki_oncpu);
 	CP(*ki, *ki32, ki_lastcpu);
 
 	/* XXX TODO: wrap cpu value as appropriate */
 	CP(*ki, *ki32, ki_oncpu_old);
 	CP(*ki, *ki32, ki_lastcpu_old);
 
 	/* Fixed-size name buffers, copied whole including the terminator. */
 	bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
 	bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
 	bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
 	bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
 	bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
 	bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
 	bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
 	CP(*ki, *ki32, ki_tracer);
 	CP(*ki, *ki32, ki_flag2);
 	CP(*ki, *ki32, ki_fibnum);
 	CP(*ki, *ki32, ki_cr_flags);
 	CP(*ki, *ki32, ki_jid);
 	CP(*ki, *ki32, ki_numthreads);
 	CP(*ki, *ki32, ki_tid);
 	CP(*ki, *ki32, ki_pri);
 	freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
 	freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
 	PTRTRIM_CP(*ki, *ki32, ki_pcb);
 	PTRTRIM_CP(*ki, *ki32, ki_kstack);
 	PTRTRIM_CP(*ki, *ki32, ki_udata);
 	CP(*ki, *ki32, ki_sflag);
 	CP(*ki, *ki32, ki_tdflags);
 }
 #endif
 
 /*
  * Append kinfo_proc record(s) describing process 'p' to 'sb'.
  *
  * With KERN_PROC_NOTHREADS set in 'flags' a single record is emitted;
  * otherwise one record is emitted per thread, each refreshed with that
  * thread's data.  When built with COMPAT_FREEBSD32 and KERN_PROC_MASK32
  * is set, every record is converted to kinfo_proc32 before being
  * appended.
  *
  * Must be called with the process locked and with at least one thread
  * in the process; the process lock is dropped before returning.
  *
  * Returns 0 on success, or ENOMEM if appending to the sbuf fails.
  */
 int
 kern_proc_out(struct proc *p, struct sbuf *sb, int flags)
 {
 	struct thread *td;
 	struct kinfo_proc ki;
 #ifdef COMPAT_FREEBSD32
 	struct kinfo_proc32 ki32;
 #endif
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
 
 	error = 0;
 	fill_kinfo_proc(p, &ki);
 	if ((flags & KERN_PROC_NOTHREADS) != 0) {
 #ifdef COMPAT_FREEBSD32
 		if ((flags & KERN_PROC_MASK32) != 0) {
 			freebsd32_kinfo_proc_out(&ki, &ki32);
 			if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0)
 				error = ENOMEM;
 		} else
 #endif
 			/* Native layout; also the only path without compat. */
 			if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0)
 				error = ENOMEM;
 	} else {
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/* Overlay this thread's data on the process record. */
 			fill_kinfo_thread(td, &ki, 1);
 #ifdef COMPAT_FREEBSD32
 			if ((flags & KERN_PROC_MASK32) != 0) {
 				freebsd32_kinfo_proc_out(&ki, &ki32);
 				if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0)
 					error = ENOMEM;
 			} else
 #endif
 				if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0)
 					error = ENOMEM;
 			/* Stop at the first record that could not be stored. */
 			if (error != 0)
 				break;
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Copy out the kinfo_proc record(s) for process 'p' through the sysctl
  * request 'req', using an sbuf backed by an on-stack kinfo_proc.
  *
  * 'p' must be locked on entry; kern_proc_out() drops that lock.  After
  * the data has been emitted the pid is looked up again (in the zombie
  * list when 'doingzomb' is set, otherwise with pfind(), except for
  * pid 0 which is not re-checked) and ESRCH is returned if it no longer
  * resolves to the same process.
  *
  * Returns the kern_proc_out() error if any, else the sbuf_finish()
  * error if any, else 0 or ESRCH as described above.
  */
 static int
 sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags,
     int doingzomb)
 {
 	struct kinfo_proc ki;
 	struct sbuf sb;
 	struct proc *np;
 	pid_t pid;
 	int error, sberror;
 
 	/* Remember the pid now; 'p' is unlocked once the data is out. */
 	pid = p->p_pid;
 
 	sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req);
 	error = kern_proc_out(p, &sb, flags);
 	sberror = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	if (error != 0)
 		return (error);
 	if (sberror != 0)
 		return (sberror);
 
 	if (doingzomb) {
 		np = zpfind(pid);
 	} else if (pid == 0) {
 		return (0);
 	} else {
 		np = pfind(pid);
 	}
 	if (np == NULL)
 		return (ESRCH);
 
 	/* The lookup returned the process locked; only identity matters. */
 	PROC_UNLOCK(np);
 	return (np == p ? 0 : ESRCH);
 }
 
 /*
  * Sysctl handler for the kern.proc.* process-listing nodes.
  *
  * The node's oid number selects the filter (all, by pid, pgrp, session,
  * tty, uid, ruid, gid, rgid, or processes only); KERN_PROC_INC_THREAD
  * in the oid number requests per-thread records.  The filter argument,
  * where one is required, is name[0].
  *
  * Returns EINVAL when the number of name components does not match the
  * selected node, otherwise 0 or the first error from wiring the user
  * buffer, looking up the process, or emitting a record.
  */
 static int
 sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, doingzomb, oid_number;
 	int error = 0;
 
 	/*
 	 * Per-thread output is produced for KERN_PROC_ALL and for any node
 	 * with KERN_PROC_INC_THREAD set; the bit is stripped so the
 	 * comparisons below see the base node number.
 	 */
 	oid_number = oidp->oid_number;
 	if (oid_number != KERN_PROC_ALL &&
 	    (oid_number & KERN_PROC_INC_THREAD) == 0)
 		flags = KERN_PROC_NOTHREADS;
 	else {
 		flags = 0;
 		oid_number &= ~KERN_PROC_INC_THREAD;
 	}
 #ifdef COMPAT_FREEBSD32
 	if (req->flags & SCTL_MASK32)
 		flags |= KERN_PROC_MASK32;
 #endif
 	/* Single-process lookup: no list walk required. */
 	if (oid_number == KERN_PROC_PID) {
 		if (namelen != 1)
 			return (EINVAL);
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);
 		sx_slock(&proctree_lock);
 		error = pget((pid_t)name[0], PGET_CANSEE, &p);
 		if (error == 0)
 			error = sysctl_out_proc(p, req, flags, 0);
 		sx_sunlock(&proctree_lock);
 		return (error);
 	}
 
 	/* Validate the number of arguments for the remaining nodes. */
 	switch (oid_number) {
 	case KERN_PROC_ALL:
 		if (namelen != 0)
 			return (EINVAL);
 		break;
 	case KERN_PROC_PROC:
 		if (namelen != 0 && namelen != 1)
 			return (EINVAL);
 		break;
 	default:
 		if (namelen != 1)
 			return (EINVAL);
 		break;
 	}
 
 	if (!req->oldptr) {
 		/* overestimate by 5 procs */
 		error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
 		if (error)
 			return (error);
 	}
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sx_slock(&proctree_lock);
 	sx_slock(&allproc_lock);
 	/* Pass 0 walks allproc, pass 1 walks the zombie list. */
 	for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
 		if (!doingzomb)
 			p = LIST_FIRST(&allproc);
 		else
 			p = LIST_FIRST(&zombproc);
 		for (; p != 0; p = LIST_NEXT(p, p_list)) {
 			/*
 			 * Skip embryonic processes.
 			 */
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			KASSERT(p->p_ucred != NULL,
 			    ("process credential is NULL for non-NEW proc"));
 			/*
 			 * Show a user only appropriate processes.
 			 */
 			if (p_cansee(curthread, p)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * TODO - make more efficient (see notes below).
 			 * do by session.
 			 */
 			switch (oid_number) {
 
 			case KERN_PROC_GID:
 				if (p->p_ucred->cr_gid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
 				if (p->p_pgrp == NULL ||
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RGID:
 				if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_SESSION:
 				if (p->p_session == NULL ||
 				    p->p_session->s_sid != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_TTY:
 				if ((p->p_flag & P_CONTROLT) == 0 ||
 				    p->p_session == NULL) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				/* XXX proctree_lock */
 				SESS_LOCK(p->p_session);
 				if (p->p_session->s_ttyp == NULL ||
 				    tty_udev(p->p_session->s_ttyp) !=
 				    (dev_t)name[0]) {
 					SESS_UNLOCK(p->p_session);
 					PROC_UNLOCK(p);
 					continue;
 				}
 				SESS_UNLOCK(p->p_session);
 				break;
 
 			case KERN_PROC_UID:
 				if (p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
 				if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PROC:
 				break;
 
 			default:
 				break;
 
 			}
 
 			/*
 			 * The process is still locked here; the output path
 			 * (sysctl_out_proc -> kern_proc_out) unlocks it.
 			 */
 			error = sysctl_out_proc(p, req, flags, doingzomb);
 			if (error) {
 				sx_sunlock(&allproc_lock);
 				sx_sunlock(&proctree_lock);
 				return (error);
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	sx_sunlock(&proctree_lock);
 	return (0);
 }
 
 /*
  * Allocate a pargs header followed by 'len' bytes of argument storage.
  * The returned structure has its reference count initialized to 1 and
  * ar_length set to 'len'; the argument bytes are not initialized.
  */
 struct pargs *
 pargs_alloc(int len)
 {
 	struct pargs *newpa;
 
 	newpa = malloc(sizeof(*newpa) + len, M_PARGS, M_WAITOK);
 	newpa->ar_length = len;
 	refcount_init(&newpa->ar_ref, 1);
 
 	return (newpa);
 }
 
 /*
  * Release the storage of a pargs structure unconditionally; reference
  * counting is handled by pargs_drop().
  */
 static void
 pargs_free(struct pargs *pa)
 {
 
 	free(pa, M_PARGS);
 }
 
 /*
  * Take an additional reference on 'pa'.  A NULL 'pa' is ignored.
  */
 void
 pargs_hold(struct pargs *pa)
 {
 
 	if (pa != NULL)
 		refcount_acquire(&pa->ar_ref);
 }
 
 /*
  * Drop one reference on 'pa' and free it when refcount_release()
  * reports that the last reference is gone.  A NULL 'pa' is ignored.
  */
 void
 pargs_drop(struct pargs *pa)
 {
 
 	if (pa != NULL && refcount_release(&pa->ar_ref))
 		pargs_free(pa);
 }
 
 /*
  * Read 'len' bytes at address 'offset' in the address space of process
  * 'p' into the kernel buffer 'buf'.  A single-segment UIO_SYSSPACE read
  * request is built on behalf of thread 'td' and handed to proc_rwmem(),
  * whose return value is passed back unchanged.
  */
 static int
 proc_read_mem(struct thread *td, struct proc *p, vm_offset_t offset, void* buf,
     size_t len)
 {
 	struct uio req;
 	struct iovec seg;
 
 	/* One segment covering the whole destination buffer. */
 	seg.iov_base = (caddr_t)buf;
 	seg.iov_len = len;
 
 	req.uio_td = td;
 	req.uio_rw = UIO_READ;
 	req.uio_segflg = UIO_SYSSPACE;
 	req.uio_iov = &seg;
 	req.uio_iovcnt = 1;
 	req.uio_offset = offset;
 	req.uio_resid = (ssize_t)len;
 
 	return (proc_rwmem(p, &req));
 }
 
 /*
  * Read up to 'len' bytes of a string located at 'sptr' in process 'p'
  * into 'buf'.
  *
  * The whole chunk is read in one go first.  If that fails with EFAULT
  * the read is retried one byte at a time, stopping after the first NUL
  * byte or after 'len' bytes; an error from a single-byte read is
  * returned as is.  Any other result of the initial read is returned
  * unchanged.
  */
 static int
 proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf,
     size_t len)
 {
 	size_t off;
 	int error;
 
 	error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, len);
 	if (error != EFAULT)
 		return (error);
 
 	/*
 	 * EFAULT is legitimate when the string is shorter than the chunk
 	 * and ends right at a page boundary with the following page not
 	 * mapped, so fall back to reading byte by byte.
 	 */
 	for (off = 0; off < len; off++) {
 		error = proc_read_mem(td, p, (vm_offset_t)(sptr + off),
 		    &buf[off], 1);
 		if (error != 0)
 			return (error);
 		if (buf[off] == '\0')
 			break;
 	}
 	return (0);
 }
 
 #define PROC_AUXV_MAX	256	/* Safety limit on auxv size. */
 
 /*
  * Selects which per-process vector get_proc_vector() and
  * get_ps_strings() operate on.
  */
 enum proc_vector_type {
 	PROC_ARG,	/* argument vector (ps_argvstr) */
 	PROC_ENV,	/* environment vector (ps_envstr) */
 	PROC_AUX,	/* ELF auxiliary vector, located after the env vector */
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit counterpart of get_proc_vector(): fetch the argv, envp or
  * auxv vector of process 'p' using the freebsd32 ps_strings layout.
  *
  * On success *proc_vectorp points to an M_TEMP allocation that the
  * caller must free, and *vsizep holds the number of entries.  For
  * PROC_ARG and PROC_ENV the 32-bit pointers are widened into a native
  * char * array.  For PROC_AUX the raw Elf32_Auxinfo array (including
  * the terminating AT_NULL entry) is returned as is.
  *
  * Returns ENOEXEC when the counts or alignment look bogus, EINVAL for
  * an unknown type, or the error from reading the target's memory.
  */
 static int
 get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp,
     size_t *vsizep, enum proc_vector_type type)
 {
 	struct freebsd32_ps_strings pss;
 	Elf32_Auxinfo aux;
 	vm_offset_t vptr, ptr;
 	uint32_t *proc_vector32;
 	char **proc_vector;
 	size_t vsize, size;
 	int i, error;
 
 	error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
 	    &pss, sizeof(pss));
 	if (error != 0)
 		return (error);
 	switch (type) {
 	case PROC_ARG:
 		vptr = (vm_offset_t)PTRIN(pss.ps_argvstr);
 		vsize = pss.ps_nargvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(int32_t);
 		break;
 	case PROC_ENV:
 		vptr = (vm_offset_t)PTRIN(pss.ps_envstr);
 		vsize = pss.ps_nenvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(int32_t);
 		break;
 	case PROC_AUX:
 		/* The auxv follows the env array and its NULL terminator. */
 		vptr = (vm_offset_t)PTRIN(pss.ps_envstr) +
 		    (pss.ps_nenvstr + 1) * sizeof(int32_t);
 		if (vptr % 4 != 0)
 			return (ENOEXEC);
 		/* First pass: count entries up to and including AT_NULL. */
 		for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
 			error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
 			if (error != 0)
 				return (error);
 			if (aux.a_type == AT_NULL)
 				break;
 			ptr += sizeof(aux);
 		}
 		/* No terminator within PROC_AUXV_MAX entries: give up. */
 		if (aux.a_type != AT_NULL)
 			return (ENOEXEC);
 		vsize = i + 1;
 		size = vsize * sizeof(aux);
 		break;
 	default:
 		KASSERT(0, ("Wrong proc vector type: %d", type));
 		return (EINVAL);
 	}
 	proc_vector32 = malloc(size, M_TEMP, M_WAITOK);
 	error = proc_read_mem(td, p, vptr, proc_vector32, size);
 	if (error != 0)
 		goto done;
 	if (type == PROC_AUX) {
 		/* Ownership of the raw buffer passes to the caller. */
 		*proc_vectorp = (char **)proc_vector32;
 		*vsizep = vsize;
 		return (0);
 	}
 	/* Widen each 32-bit user pointer into a native pointer. */
 	proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK);
 	for (i = 0; i < (int)vsize; i++)
 		proc_vector[i] = PTRIN(proc_vector32[i]);
 	*proc_vectorp = proc_vector;
 	*vsizep = vsize;
 done:
 	free(proc_vector32, M_TEMP);
 	return (error);
 }
 #endif
 
 /*
  * Fetch the argv, envp or ELF auxv vector of process 'p' by reading
  * its ps_strings structure and then the vector itself from the
  * target's address space.  ILP32 processes are handed off to
  * get_proc_vector32() when COMPAT_FREEBSD32 is configured.
  *
  * On success *proc_vectorp points to an M_TEMP allocation that the
  * caller must free, and *vsizep holds the number of entries (for
  * PROC_AUX this includes the terminating AT_NULL entry).
  *
  * Returns ENOEXEC when the counts or alignment look bogus, EINVAL for
  * an unknown type, or the error from reading the target's memory.
  */
 static int
 get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp,
     size_t *vsizep, enum proc_vector_type type)
 {
 	struct ps_strings pss;
 	Elf_Auxinfo aux;
 	vm_offset_t vptr, ptr;
 	char **proc_vector;
 	size_t vsize, size;
 	int error, i;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(p, SV_ILP32) != 0)
 		return (get_proc_vector32(td, p, proc_vectorp, vsizep, type));
 #endif
 	error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
 	    &pss, sizeof(pss));
 	if (error != 0)
 		return (error);
 	switch (type) {
 	case PROC_ARG:
 		vptr = (vm_offset_t)pss.ps_argvstr;
 		vsize = pss.ps_nargvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(char *);
 		break;
 	case PROC_ENV:
 		vptr = (vm_offset_t)pss.ps_envstr;
 		vsize = pss.ps_nenvstr;
 		if (vsize > ARG_MAX)
 			return (ENOEXEC);
 		size = vsize * sizeof(char *);
 		break;
 	case PROC_AUX:
 		/*
 		 * The aux array is just above env array on the stack. Check
 		 * that the address is naturally aligned.
 		 */
 		vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1)
 		    * sizeof(char *);
 #if __ELF_WORD_SIZE == 64
 		if (vptr % sizeof(uint64_t) != 0)
 #else
 		if (vptr % sizeof(uint32_t) != 0)
 #endif
 			return (ENOEXEC);
 		/*
 		 * We count the array size reading the aux vectors from the
 		 * stack until AT_NULL vector is returned.  So (to keep the code
 		 * simple) we read the process stack twice: the first time here
 		 * to find the size and the second time when copying the vectors
 		 * to the allocated proc_vector.
 		 */
 		for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
 			error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
 			if (error != 0)
 				return (error);
 			if (aux.a_type == AT_NULL)
 				break;
 			ptr += sizeof(aux);
 		}
 		/*
 		 * If the PROC_AUXV_MAX entries are iterated over, and we have
 		 * not reached AT_NULL, it is most likely we are reading wrong
 		 * data: either the process doesn't have auxv array or data has
 		 * been modified. Return the error in this case.
 		 */
 		if (aux.a_type != AT_NULL)
 			return (ENOEXEC);
 		vsize = i + 1;
 		size = vsize * sizeof(aux);
 		break;
 	default:
 		KASSERT(0, ("Wrong proc vector type: %d", type));
 		return (EINVAL); /* In case we are built without INVARIANTS. */
 	}
 	/*
 	 * M_WAITOK allocations sleep until memory is available and never
 	 * return NULL, so no failure check is needed here.
 	 */
 	proc_vector = malloc(size, M_TEMP, M_WAITOK);
 	error = proc_read_mem(td, p, vptr, proc_vector, size);
 	if (error != 0) {
 		free(proc_vector, M_TEMP);
 		return (error);
 	}
 	*proc_vectorp = proc_vector;
 	*vsizep = vsize;
 
 	return (0);
 }
 
 #define GET_PS_STRINGS_CHUNK_SZ	256	/* Chunk size (bytes) for ps_strings operations. */
 
 /*
  * Append the argument or environment strings of process 'p' to 'sb',
  * each followed by a NUL byte.
  *
  * The vector of string pointers is obtained with get_proc_vector() and
  * every string is then read from the target in chunks of
  * GET_PS_STRINGS_CHUNK_SZ bytes.  Output stops at the first NULL
  * pointer in the vector or once the byte budget 'nchr' is used up.
  * The process must be held by the caller (PROC_ASSERT_HELD).
  *
  * Returns 0, or the error from fetching the vector or reading a
  * string.  NOTE(review): the return values of sbuf_bcat() are not
  * checked here; callers appear to rely on sbuf_finish() to report
  * sbuf errors.
  */
 static int
 get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb,
     enum proc_vector_type type)
 {
 	size_t done, len, nchr, vsize;
 	int error, i;
 	char **proc_vector, *sptr;
 	char pss_string[GET_PS_STRINGS_CHUNK_SZ];
 
 	PROC_ASSERT_HELD(p);
 
 	/*
 	 * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes.
 	 */
 	nchr = 2 * (PATH_MAX + ARG_MAX);
 
 	error = get_proc_vector(td, p, &proc_vector, &vsize, type);
 	if (error != 0)
 		return (error);
 	for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) {
 		/*
 		 * The program may have scribbled into its argv array, e.g. to
 		 * remove some arguments.  If that has happened, break out
 		 * before trying to read from NULL.
 		 */
 		if (proc_vector[i] == NULL)
 			break;
 		/* Read this string one chunk at a time. */
 		for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) {
 			error = proc_read_string(td, p, sptr, pss_string,
 			    sizeof(pss_string));
 			if (error != 0)
 				goto done;
 			len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ);
 			/* Clamp to the budget, leaving room for the NUL. */
 			if (done + len >= nchr)
 				len = nchr - done - 1;
 			sbuf_bcat(sb, pss_string, len);
 			/* A short chunk means the string (or budget) ended. */
 			if (len != GET_PS_STRINGS_CHUNK_SZ)
 				break;
 			done += GET_PS_STRINGS_CHUNK_SZ;
 		}
 		/* Terminate the string in the output. */
 		sbuf_bcat(sb, "", 1);
 		done += len + 1;
 	}
 done:
 	free(proc_vector, M_TEMP);
 	return (error);
 }
 
 /*
  * Append the NUL-separated argument strings of process 'p' to 'sb'.
  * Returns whatever get_ps_strings() returns.
  *
  * NOTE(review): the 'td' argument is not used; curthread is passed to
  * get_ps_strings() instead.
  */
 int
 proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 
 	return (get_ps_strings(curthread, p, sb, PROC_ARG));
 }
 
 /*
  * Append the NUL-separated environment strings of process 'p' to 'sb'.
  * Returns whatever get_ps_strings() returns.
  *
  * NOTE(review): the 'td' argument is not used; curthread is passed to
  * get_ps_strings() instead.
  */
 int
 proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 
 	return (get_ps_strings(curthread, p, sb, PROC_ENV));
 }
 
 int
 proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb)
 {
 	size_t vsize, size;
 	char **auxv;
 	int error;
 
 	error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX);
 	if (error == 0) {
 #ifdef COMPAT_FREEBSD32
 		if (SV_PROC_FLAG(p, SV_ILP32) != 0)
 			size = vsize * sizeof(Elf32_Auxinfo);
 		else
 #endif
 			size = vsize * sizeof(Elf_Auxinfo);
 		if (sbuf_bcat(sb, auxv, size) != 0)
 			error = ENOMEM;
 		free(auxv, M_TEMP);
 	}
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve the argument list or process
  * title for another process without groping around in the address space
  * of the other process.  It also allows a process to set its own
  * "process title" to a string of its own choice.
  *
  * name[0] is the target pid.  When new data is supplied the lookup is
  * restricted with PGET_ISCURRENT.  Returns EINVAL if the name does not
  * have exactly one component and ENOMEM if the new title would exceed
  * ps_arg_cache_limit.
  */
 static int
 sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct pargs *newpa, *pa;
 	struct proc *p;
 	struct sbuf sb;
 	int flags, error = 0, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	flags = PGET_CANSEE;
 	if (req->newptr != NULL)
 		flags |= PGET_ISCURRENT;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error)
 		return (error);
 
 	pa = p->p_args;
 	if (pa != NULL) {
 		/*
 		 * Cached arguments exist: keep them alive with a reference
 		 * so they can be copied out after the process is unlocked.
 		 */
 		pargs_hold(pa);
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
 		pargs_drop(pa);
 	} else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) {
 		/*
 		 * Nothing cached: read the arguments from the target's
 		 * memory, holding the process while unlocked.
 		 */
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 		sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 		error = proc_getargv(curthread, p, &sb);
 		error2 = sbuf_finish(&sb);
 		PRELE(p);
 		sbuf_delete(&sb);
 		if (error == 0 && error2 != 0)
 			error = error2;
 	} else {
 		PROC_UNLOCK(p);
 	}
 	/* Done unless the caller also supplied a new title. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
 		return (ENOMEM);
 	newpa = pargs_alloc(req->newlen);
 	error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
 	if (error != 0) {
 		pargs_free(newpa);
 		return (error);
 	}
 	/* Swap in the new arguments and release the previous ones. */
 	PROC_LOCK(p);
 	pa = p->p_args;
 	p->p_args = newpa;
 	PROC_UNLOCK(p);
 	pargs_drop(pa);
 	return (0);
 }
 
 /*
  * This sysctl allows a process to retrieve environment of another process.
  */
 static int
 sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	if ((p->p_flag & P_SYSTEM) != 0) {
 		PRELE(p);
 		return (0);
 	}
 
 	sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 	error = proc_getenvv(curthread, p, &sb);
 	error2 = sbuf_finish(&sb);
 	PRELE(p);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 /*
  * This sysctl allows a process to retrieve ELF auxiliary vector of
  * another process.
  */
 static int
 sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	if ((p->p_flag & P_SYSTEM) != 0) {
 		PRELE(p);
 		return (0);
 	}
 	sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
 	error = proc_getauxv(curthread, p, &sb);
 	error2 = sbuf_finish(&sb);
 	PRELE(p);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 /*
  * This sysctl allows a process to retrieve the path of the executable for
  * itself or another process.
  *
  * The single name component is the target pid; -1 selects the calling
  * process.  Returns EINVAL unless exactly one component is given, 0
  * with no output when the process has no text vnode, and otherwise the
  * result of the lookup, path resolution or copy-out.
  */
 static int
 sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct vnode *vp;
 	char *retbuf, *freebuf;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 	} else {
 		error = pget(*pidp, PGET_CANSEE, &p);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * The process lock is only held when the process came from pget(),
 	 * hence the conditional unlocks below.
 	 */
 	vp = p->p_textvp;
 	if (vp == NULL) {
 		if (*pidp != -1)
 			PROC_UNLOCK(p);
 		return (0);
 	}
 	/* Reference the vnode so it can be used after unlocking. */
 	vref(vp);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
 	error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
 	vrele(vp);
 	if (error)
 		return (error);
 	/* Copy out the path including its terminating NUL. */
 	error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
 	free(freebuf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	char *sv_name;
 	int *name;
 	int namelen;
 	int error;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANSEE, &p);
 	if (error != 0)
 		return (error);
 	sv_name = p->p_sysent->sv_name;
 	PROC_UNLOCK(p);
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
 /*
  * When the expected size is defined, verify at compile time that
  * struct kinfo_ovmentry matches it.
  */
 #ifdef KINFO_OVMENTRY_SIZE
 CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 /*
  * COMPAT_FREEBSD7 sysctl handler: export the VM map of the process
  * whose pid is name[0] as a sequence of struct kinfo_ovmentry records,
  * one per map entry (sub-map entries are skipped).
  *
  * The map read lock is dropped while each record is completed and
  * copied out; if the map timestamp changed in the meantime, iteration
  * is resynchronized with vm_map_lookup_entry().
  *
  * Returns the pget() error, ESRCH if the process has no vmspace, or
  * the last SYSCTL_OUT() result.
  * NOTE(review): the number of name components (arg2) is not validated
  * here, unlike in the sibling handlers.
  */
 static int
 sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
 {
 	vm_map_entry_t entry, tmp_entry;
 	unsigned int last_timestamp;
 	char *fullpath, *freepath;
 	struct kinfo_ovmentry *kve;
 	struct vattr va;
 	struct ucred *cred;
 	int error, *name;
 	struct vnode *vp;
 	struct proc *p;
 	vm_map_t map;
 	struct vmspace *vm;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 	vm = vmspace_acquire_ref(p);
 	if (vm == NULL) {
 		PRELE(p);
 		return (ESRCH);
 	}
 	kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
 
 	map = &vm->vm_map;
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		vm_object_t obj, tobj, lobj;
 		vm_offset_t addr;
 
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		bzero(kve, sizeof(*kve));
 		kve->kve_structsize = sizeof(*kve);
 
 		kve->kve_private_resident = 0;
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			VM_OBJECT_RLOCK(obj);
 			if (obj->shadow_count == 1)
 				kve->kve_private_resident =
 				    obj->resident_page_count;
 		}
 		/* Count the pages of this entry that have a mapping. */
 		kve->kve_resident = 0;
 		addr = entry->start;
 		while (addr < entry->end) {
 			if (pmap_extract(map->pmap, addr))
 				kve->kve_resident++;
 			addr += PAGE_SIZE;
 		}
 
 		/*
 		 * Walk to the bottom of the backing-object chain with
 		 * hand-over-hand read locks.  Afterwards 'lobj' is the last
 		 * object in the chain; it and 'obj' remain locked.
 		 */
 		for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_RLOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 			lobj = tobj;
 		}
 
 		kve->kve_start = (void*)entry->start;
 		kve->kve_end = (void*)entry->end;
 		kve->kve_offset = (off_t)entry->offset;
 
 		if (entry->protection & VM_PROT_READ)
 			kve->kve_protection |= KVME_PROT_READ;
 		if (entry->protection & VM_PROT_WRITE)
 			kve->kve_protection |= KVME_PROT_WRITE;
 		if (entry->protection & VM_PROT_EXECUTE)
 			kve->kve_protection |= KVME_PROT_EXEC;
 
 		if (entry->eflags & MAP_ENTRY_COW)
 			kve->kve_flags |= KVME_FLAG_COW;
 		if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
 
 		/* Remember the map generation before dropping the lock. */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 
 		kve->kve_fileid = 0;
 		kve->kve_fsid = 0;
 		freepath = NULL;
 		fullpath = "";
 		if (lobj) {
 			vp = NULL;
 			switch (lobj->type) {
 			case OBJT_DEFAULT:
 				kve->kve_type = KVME_TYPE_DEFAULT;
 				break;
 			case OBJT_VNODE:
 				kve->kve_type = KVME_TYPE_VNODE;
 				vp = lobj->handle;
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				kve->kve_type = KVME_TYPE_SWAP;
 				break;
 			case OBJT_DEVICE:
 				kve->kve_type = KVME_TYPE_DEVICE;
 				break;
 			case OBJT_PHYS:
 				kve->kve_type = KVME_TYPE_PHYS;
 				break;
 			case OBJT_DEAD:
 				kve->kve_type = KVME_TYPE_DEAD;
 				break;
 			case OBJT_SG:
 				kve->kve_type = KVME_TYPE_SG;
 				break;
 			default:
 				kve->kve_type = KVME_TYPE_UNKNOWN;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 
 			kve->kve_ref_count = obj->ref_count;
 			kve->kve_shadow_count = obj->shadow_count;
 			VM_OBJECT_RUNLOCK(obj);
 			/* For vnode-backed objects, resolve path and ids. */
 			if (vp != NULL) {
 				vn_fullpath(curthread, vp, &fullpath,
 				    &freepath);
 				cred = curthread->td_ucred;
 				vn_lock(vp, LK_SHARED | LK_RETRY);
 				if (VOP_GETATTR(vp, &va, cred) == 0) {
 					kve->kve_fileid = va.va_fileid;
+					/* truncate */
 					kve->kve_fsid = va.va_fsid;
 				}
 				vput(vp);
 			}
 		} else {
 			kve->kve_type = KVME_TYPE_NONE;
 			kve->kve_ref_count = 0;
 			kve->kve_shadow_count = 0;
 		}
 
 		strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		error = SYSCTL_OUT(req, kve, sizeof(*kve));
 		/* Retake the map lock before touching 'entry' again. */
 		vm_map_lock_read(map);
 		if (error)
 			break;
 		/*
 		 * The map changed while unlocked: find the entry covering
 		 * the last address of the one just reported and continue
 		 * from there.
 		 */
 		if (last_timestamp != map->timestamp) {
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	vmspace_free(vm);
 	PRELE(p);
 	free(kve, M_TEMP);
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD7 */
 
 /*
  * When the expected size is defined, verify at compile time that
  * struct kinfo_vmentry matches it.
  */
 #ifdef KINFO_VMENTRY_SIZE
 CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
 #endif
 
 /*
  * Account the resident pages of map entry 'entry' in
  * kve->kve_resident, and set KVME_FLAG_SUPER in kve->kve_flags when a
  * superpage mapping is found.
  *
  * For each page index of the entry the backing-object chain is
  * searched from the top object down.  When no object holds a page at
  * the current index, the scan jumps ahead to the nearest higher index
  * at which some object in the chain does hold one.
  * kern_proc_vmmap_out() read-locks the whole backing-object chain
  * before calling this function.
  */
 static void
 kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry,
     struct kinfo_vmentry *kve)
 {
 	vm_object_t obj, tobj;
 	vm_page_t m, m_adv;
 	vm_offset_t addr;
 	vm_paddr_t locked_pa;
 	vm_pindex_t pi, pi_adv, pindex;
 
 	locked_pa = 0;
 	obj = entry->object.vm_object;
 	addr = entry->start;
 	m_adv = NULL;
 	pi = OFF_TO_IDX(entry->offset);
 	/* 'pi_adv' is the number of pages to step; set in the body. */
 	for (; addr < entry->end; addr += IDX_TO_OFF(pi_adv), pi += pi_adv) {
 		if (m_adv != NULL) {
 			/* The previous iteration already located this page. */
 			m = m_adv;
 		} else {
 			/* Default step: skip the rest of the entry. */
 			pi_adv = OFF_TO_IDX(entry->end - addr);
 			pindex = pi;
 			for (tobj = obj;; tobj = tobj->backing_object) {
 				m = vm_page_find_least(tobj, pindex);
 				if (m != NULL) {
 					/* Exact hit: page is resident. */
 					if (m->pindex == pindex)
 						break;
 					/* Remember the closest later page. */
 					if (pi_adv > m->pindex - pindex) {
 						pi_adv = m->pindex - pindex;
 						m_adv = m;
 					}
 				}
 				if (tobj->backing_object == NULL)
 					goto next;
 				/* Translate the index into the backing object. */
 				pindex += OFF_TO_IDX(tobj->
 				    backing_object_offset);
 			}
 		}
 		m_adv = NULL;
 		if (m->psind != 0 && addr + pagesizes[1] <= entry->end &&
 		    (addr & (pagesizes[1] - 1)) == 0 &&
 		    (pmap_mincore(map->pmap, addr, &locked_pa) &
 		    MINCORE_SUPER) != 0) {
 			kve->kve_flags |= KVME_FLAG_SUPER;
 			pi_adv = OFF_TO_IDX(pagesizes[1]);
 		} else {
 			/*
 			 * We do not test the found page on validity.
 			 * Either the page is busy and being paged in,
 			 * or it was invalidated.  The first case
 			 * should be counted as resident, the second
 			 * is not so clear; we do account both.
 			 */
 			pi_adv = 1;
 		}
 		kve->kve_resident += pi_adv;
 next:;
 	}
 	PA_UNLOCK_COND(locked_pa);
 }
 
 /*
  * Append one packed struct kinfo_vmentry record to 'sb' for every
  * non-submap entry in the address space of process 'p'.
  *
  * Must be called with the process locked and will return unlocked.  The
  * process is held with _PHOLD()/PRELE() for the duration of the walk.
  *
  * Returns 0 on success, ESRCH if no vmspace reference could be acquired,
  * or ENOMEM if appending a record to 'sb' failed.
  */
 int
 kern_proc_vmmap_out(struct proc *p, struct sbuf *sb)
 {
 	vm_map_entry_t entry, tmp_entry;
 	struct vattr va;
 	vm_map_t map;
 	vm_object_t obj, tobj, lobj;
 	char *fullpath, *freepath;
 	struct kinfo_vmentry *kve;
 	struct ucred *cred;
 	struct vnode *vp;
 	struct vmspace *vm;
 	vm_offset_t addr;
 	unsigned int last_timestamp;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Hold the process, then drop its lock before the sleeping malloc. */
 	_PHOLD(p);
 	PROC_UNLOCK(p);
 	vm = vmspace_acquire_ref(p);
 	if (vm == NULL) {
 		PRELE(p);
 		return (ESRCH);
 	}
 	kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
 
 	error = 0;
 	map = &vm->vm_map;
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			continue;
 
 		/*
 		 * Remember where this entry ends so that the walk can be
 		 * resumed by address if the map changes while it is unlocked
 		 * further down.
 		 */
 		addr = entry->end;
 		bzero(kve, sizeof(*kve));
 		obj = entry->object.vm_object;
 		if (obj != NULL) {
 			/*
 			 * Read-lock every object in the backing chain; lobj
 			 * ends up pointing at the last object of the chain.
 			 */
 			for (tobj = obj; tobj != NULL;
 			    tobj = tobj->backing_object) {
 				VM_OBJECT_RLOCK(tobj);
 				lobj = tobj;
 			}
 			if (obj->backing_object == NULL)
 				kve->kve_private_resident =
 				    obj->resident_page_count;
 			if (!vmmap_skip_res_cnt)
 				kern_proc_vmmap_resident(map, entry, kve);
 			/*
 			 * Unlock the intermediate objects only.  obj and lobj
 			 * stay locked until their fields have been read below.
 			 */
 			for (tobj = obj; tobj != NULL;
 			    tobj = tobj->backing_object) {
 				if (tobj != obj && tobj != lobj)
 					VM_OBJECT_RUNLOCK(tobj);
 			}
 		} else {
 			lobj = NULL;
 		}
 
 		kve->kve_start = entry->start;
 		kve->kve_end = entry->end;
 		kve->kve_offset = entry->offset;
 
 		/* Translate VM_PROT_* bits into KVME_PROT_* bits. */
 		if (entry->protection & VM_PROT_READ)
 			kve->kve_protection |= KVME_PROT_READ;
 		if (entry->protection & VM_PROT_WRITE)
 			kve->kve_protection |= KVME_PROT_WRITE;
 		if (entry->protection & VM_PROT_EXECUTE)
 			kve->kve_protection |= KVME_PROT_EXEC;
 
 		/* Translate MAP_ENTRY_* flags into KVME_FLAG_* flags. */
 		if (entry->eflags & MAP_ENTRY_COW)
 			kve->kve_flags |= KVME_FLAG_COW;
 		if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
 		if (entry->eflags & MAP_ENTRY_GROWS_UP)
 			kve->kve_flags |= KVME_FLAG_GROWS_UP;
 		if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
 			kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
 
 		/*
 		 * Record the map timestamp and drop the map lock; the path
 		 * lookup, attribute fetch and sbuf copy below run without it.
 		 */
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
 
 		freepath = NULL;
 		fullpath = "";
 		if (lobj != NULL) {
 			vp = NULL;
 			/* The reported type is that of the last chain object. */
 			switch (lobj->type) {
 			case OBJT_DEFAULT:
 				kve->kve_type = KVME_TYPE_DEFAULT;
 				break;
 			case OBJT_VNODE:
 				kve->kve_type = KVME_TYPE_VNODE;
 				vp = lobj->handle;
 				/* Reference the vnode; dropped by vput() below. */
 				vref(vp);
 				break;
 			case OBJT_SWAP:
 				kve->kve_type = KVME_TYPE_SWAP;
 				break;
 			case OBJT_DEVICE:
 				kve->kve_type = KVME_TYPE_DEVICE;
 				break;
 			case OBJT_PHYS:
 				kve->kve_type = KVME_TYPE_PHYS;
 				break;
 			case OBJT_DEAD:
 				kve->kve_type = KVME_TYPE_DEAD;
 				break;
 			case OBJT_SG:
 				kve->kve_type = KVME_TYPE_SG;
 				break;
 			case OBJT_MGTDEVICE:
 				kve->kve_type = KVME_TYPE_MGTDEVICE;
 				break;
 			default:
 				kve->kve_type = KVME_TYPE_UNKNOWN;
 				break;
 			}
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 
 			/* Counts come from the top object, not from lobj. */
 			kve->kve_ref_count = obj->ref_count;
 			kve->kve_shadow_count = obj->shadow_count;
 			VM_OBJECT_RUNLOCK(obj);
 			if (vp != NULL) {
 				vn_fullpath(curthread, vp, &fullpath,
 				    &freepath);
 				kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
 				cred = curthread->td_ucred;
 				vn_lock(vp, LK_SHARED | LK_RETRY);
 				/*
 				 * Attributes are only reported, and
 				 * KF_ATTR_VALID only set, if VOP_GETATTR()
 				 * succeeds.
 				 */
 				if (VOP_GETATTR(vp, &va, cred) == 0) {
 					kve->kve_vn_fileid = va.va_fileid;
 					kve->kve_vn_fsid = va.va_fsid;
+					kve->kve_vn_fsid_freebsd10 =
+					    kve->kve_vn_fsid; /* truncate */
 					kve->kve_vn_mode =
 					    MAKEIMODE(va.va_type, va.va_mode);
+					kve->kve_vn_mode_freebsd10 =
+					    kve->kve_vn_mode; /* truncate */
 					kve->kve_vn_size = va.va_size;
 					kve->kve_vn_rdev = va.va_rdev;
+					kve->kve_vn_rdev_freebsd10 =
+					    kve->kve_vn_rdev; /* truncate */
 					kve->kve_status = KF_ATTR_VALID;
 				}
 				vput(vp);
 			}
 		} else {
 			kve->kve_type = KVME_TYPE_NONE;
 			kve->kve_ref_count = 0;
 			kve->kve_shadow_count = 0;
 		}
 
 		strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/*
 		 * Pack record size down: only the used part of kve_path
 		 * (including its terminating NUL) is emitted, rounded up to
 		 * a multiple of sizeof(uint64_t).
 		 */
 		kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
 		    strlen(kve->kve_path) + 1;
 		kve->kve_structsize = roundup(kve->kve_structsize,
 		    sizeof(uint64_t));
 		if (sbuf_bcat(sb, kve, kve->kve_structsize) != 0)
 			error = ENOMEM;
 		/* Retake the map lock before leaving or continuing the loop. */
 		vm_map_lock_read(map);
 		if (error != 0)
 			break;
 		/*
 		 * The map changed while it was unlocked, so 'entry' may be
 		 * stale.  Look up the entry for the last byte of the range
 		 * just reported and continue the walk from there.
 		 */
 		if (last_timestamp != map->timestamp) {
 			vm_map_lookup_entry(map, addr - 1, &tmp_entry);
 			entry = tmp_entry;
 		}
 	}
 	vm_map_unlock_read(map);
 	vmspace_free(vm);
 	PRELE(p);
 	free(kve, M_TEMP);
 	return (error);
 }
 
 /*
  * Sysctl handler that reports the vm map entries of the process whose pid
  * is name[0].  The records are produced by kern_proc_vmmap_out() into an
  * sbuf bound to the sysctl request.
  *
  * Returns the error from pget() or kern_proc_vmmap_out() if either fails,
  * otherwise the result of sbuf_finish().
  */
 static int
 sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	struct sbuf sb;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	/* kern_proc_vmmap_out() asserts that p is locked on entry. */
 	error = kern_proc_vmmap_out(p, &sb);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	/* An error from the walk takes precedence over the sbuf error. */
 	return (error != 0 ? error : error2);
 }
 
 #if defined(STACK) || defined(DDB)
 static int
 sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_kstack *kkstp;
 	int error, i, *name, numthreads;
 	lwpid_t *lwpidarray;
 	struct thread *td;
 	struct stack *st;
 	struct sbuf sb;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 
 	kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
 	st = stack_create();
 
 	lwpidarray = NULL;
 	numthreads = 0;
 	PROC_LOCK(p);
 repeat:
 	if (numthreads < p->p_numthreads) {
 		if (lwpidarray != NULL) {
 			free(lwpidarray, M_TEMP);
 			lwpidarray = NULL;
 		}
 		numthreads = p->p_numthreads;
 		PROC_UNLOCK(p);
 		lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
 		    M_WAITOK | M_ZERO);
 		PROC_LOCK(p);
 		goto repeat;
 	}
 	i = 0;
 
 	/*
 	 * XXXRW: During the below loop, execve(2) and countless other sorts
 	 * of changes could have taken place.  Should we check to see if the
 	 * vmspace has been replaced, or the like, in order to prevent
 	 * giving a snapshot that spans, say, execve(2), with some threads
 	 * before and some after?  Among other things, the credentials could
 	 * have changed, in which case the right to extract debug info might
 	 * no longer be assured.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(i < numthreads,
 		    ("sysctl_kern_proc_kstack: numthreads"));
 		lwpidarray[i] = td->td_tid;
 		i++;
 	}
 	numthreads = i;
 	for (i = 0; i < numthreads; i++) {
 		td = thread_find(p, lwpidarray[i]);
 		if (td == NULL) {
 			continue;
 		}
 		bzero(kkstp, sizeof(*kkstp));
 		(void)sbuf_new(&sb, kkstp->kkst_trace,
 		    sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
 		thread_lock(td);
 		kkstp->kkst_tid = td->td_tid;
 		if (TD_IS_SWAPPED(td))
 			kkstp->kkst_state = KKST_STATE_SWAPPED;
 		else if (TD_IS_RUNNING(td))
 			kkstp->kkst_state = KKST_STATE_RUNNING;
 		else {
 			kkstp->kkst_state = KKST_STATE_STACKOK;
 			stack_save_td(st, td);
 		}
 		thread_unlock(td);
 		PROC_UNLOCK(p);
 		stack_sbuf_print(&sb, st);
 		sbuf_finish(&sb);
 		sbuf_delete(&sb);
 		error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
 		PROC_LOCK(p);
 		if (error)
 			break;
 	}
 	_PRELE(p);
 	PROC_UNLOCK(p);
 	if (lwpidarray != NULL)
 		free(lwpidarray, M_TEMP);
 	stack_destroy(st);
 	free(kkstp, M_TEMP);
 	return (error);
 }
 #endif
 
 /*
  * This sysctl allows a process to retrieve the full list of groups from
  * itself or another process.
  *
  * arg1 points at the target pid (-1 selects the calling process) and arg2
  * must be 1.  The output is the target credential's cr_groups array,
  * cr_ngroups entries of gid_t.
  *
  * Returns EINVAL for a bad argument count, the pget() error if the lookup
  * fails, otherwise the result of SYSCTL_OUT().
  */
 static int
 sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct ucred *cred;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 		PROC_LOCK(p);
 	} else {
 		error = pget(*pidp, PGET_CANSEE, &p);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Take a credential reference under the process lock so the group
 	 * list can be copied out after the lock is dropped.
 	 */
 	cred = crhold(p->p_ucred);
 	PROC_UNLOCK(p);
 
 	error = SYSCTL_OUT(req, cred->cr_groups,
 	    cred->cr_ngroups * sizeof(gid_t));
 	crfree(cred);
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve and/or set the resource limit
  * for another process.
  *
  * name[0] is the target pid and name[1] selects the resource, which must be
  * below RLIM_NLIMITS.  New data, if supplied, must be exactly one struct
  * rlimit.  Setting requires PGET_CANDEBUG access; reading only requires
  * PGET_CANSEE.
  *
  * Returns EINVAL for malformed requests, the pget() error if the lookup
  * fails, otherwise the result of the copy-out, copy-in or
  * kern_proc_setrlimit().
  */
 static int
 sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct rlimit rlim;
 	struct proc *p;
 	u_int which;
 	int flags, error;
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	which = (u_int)name[1];
 	if (which >= RLIM_NLIMITS)
 		return (EINVAL);
 
 	if (req->newptr != NULL && req->newlen != sizeof(rlim))
 		return (EINVAL);
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (req->newptr != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Retrieve limit.
 	 *
 	 * rlim is only filled in when an old-value buffer was supplied;
 	 * presumably SYSCTL_OUT() copies nothing when oldptr is NULL --
 	 * TODO confirm.
 	 */
 	if (req->oldptr != NULL) {
 		PROC_LOCK(p);
 		lim_rlimit(p, which, &rlim);
 		PROC_UNLOCK(p);
 	}
 	error = SYSCTL_OUT(req, &rlim, sizeof(rlim));
 	if (error != 0)
 		goto errout;
 
 	/*
 	 * Set limit.
 	 */
 	if (req->newptr != NULL) {
 		error = SYSCTL_IN(req, &rlim, sizeof(rlim));
 		if (error == 0)
 			error = kern_proc_setrlimit(curthread, p, which, &rlim);
 	}
 
 errout:
 	/* Drop the hold requested from pget() via PGET_HOLD. */
 	PRELE(p);
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve ps_strings structure location of
  * another process.
  *
  * name[0] is the target pid.  The value written is the sv_psstrings address
  * from the target's sysentvec, as a vm_offset_t, or as a uint32_t for a
  * request flagged SCTL_MASK32 when COMPAT_FREEBSD32 is configured.
  *
  * Returns EINVAL for a bad name length, the pget() error if the lookup
  * fails, otherwise the result of SYSCTL_OUT().
  */
 static int
 sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	vm_offset_t ps_strings;
 	int error;
 #ifdef COMPAT_FREEBSD32
 	uint32_t ps_strings32;
 #endif
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	if ((req->flags & SCTL_MASK32) != 0) {
 		/*
 		 * We return 0 if the 32 bit emulation request is for a 64 bit
 		 * process.
 		 */
 		ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ?
 		    PTROUT(p->p_sysent->sv_psstrings) : 0;
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32));
 		return (error);
 	}
 #endif
 	/* Read the value under the process lock, copy it out after. */
 	ps_strings = p->p_sysent->sv_psstrings;
 	PROC_UNLOCK(p);
 	error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings));
 	return (error);
 }
 
 /*
  * This sysctl allows a process to retrieve umask of another process.
  *
  * name[0] is the target pid.  The value written is the fd_cmask field of
  * the target's file descriptor table, as a u_short.
  *
  * Returns EINVAL for a bad name length, the pget() error if the lookup
  * fails, otherwise the result of SYSCTL_OUT().
  */
 static int
 sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int error;
 	u_short fd_cmask;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_WANTREAD, &p);
 	if (error != 0)
 		return (error);
 
 	/* Read the mask under the shared filedesc lock. */
 	FILEDESC_SLOCK(p->p_fd);
 	fd_cmask = p->p_fd->fd_cmask;
 	FILEDESC_SUNLOCK(p->p_fd);
 	PRELE(p);
 	error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask));
 	return (error);
 }
 
 /*
  * This sysctl allows a process to set and retrieve binary osreldate of
  * another process.
  *
  * name[0] is the target pid.  The current p_osrel is always copied out
  * first.  If new data is supplied it must be exactly one int and must not
  * be negative; it then replaces p_osrel.  Setting requires PGET_CANDEBUG
  * access; reading only requires PGET_CANSEE.
  *
  * Returns EINVAL for malformed requests or a negative value, the pget()
  * error if the lookup fails, otherwise the result of the copy-out/copy-in.
  */
 static int
 sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, error, osrel;
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	if (req->newptr != NULL && req->newlen != sizeof(osrel))
 		return (EINVAL);
 
 	flags = PGET_HOLD | PGET_NOTWEXIT;
 	if (req->newptr != NULL)
 		flags |= PGET_CANDEBUG;
 	else
 		flags |= PGET_CANSEE;
 	error = pget((pid_t)name[0], flags, &p);
 	if (error != 0)
 		return (error);
 
 	error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel));
 	if (error != 0)
 		goto errout;
 
 	if (req->newptr != NULL) {
 		error = SYSCTL_IN(req, &osrel, sizeof(osrel));
 		if (error != 0)
 			goto errout;
 		if (osrel < 0) {
 			error = EINVAL;
 			goto errout;
 		}
 		p->p_osrel = osrel;
 	}
 errout:
 	/* Drop the hold requested from pget() via PGET_HOLD. */
 	PRELE(p);
 	return (error);
 }
 
 /*
  * Sysctl handler that reports the signal trampoline address range of the
  * process whose pid is name[0], as a struct kinfo_sigtramp (or struct
  * kinfo_sigtramp32 for a request flagged SCTL_MASK32 when COMPAT_FREEBSD32
  * is configured).
  *
  * If the sysentvec has a non-zero sv_sigcode_base, the range is
  * [sv_sigcode_base, sv_sigcode_base + *sv_szsigcode); otherwise it is the
  * *sv_szsigcode bytes ending at sv_psstrings.
  *
  * Returns EINVAL for a bad name length, the pget() error if the lookup
  * fails, otherwise the result of SYSCTL_OUT().
  */
 static int
 sysctl_kern_proc_sigtramp(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	struct kinfo_sigtramp kst;
 	const struct sysentvec *sv;
 	int error;
 #ifdef COMPAT_FREEBSD32
 	struct kinfo_sigtramp32 kst32;
 #endif
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
 	if (error != 0)
 		return (error);
 	sv = p->p_sysent;
 #ifdef COMPAT_FREEBSD32
 	if ((req->flags & SCTL_MASK32) != 0) {
 		/*
 		 * kst32 stays all-zero when the target is not an ILP32
 		 * process.
 		 */
 		bzero(&kst32, sizeof(kst32));
 		if (SV_PROC_FLAG(p, SV_ILP32)) {
 			if (sv->sv_sigcode_base != 0) {
 				kst32.ksigtramp_start = sv->sv_sigcode_base;
 				kst32.ksigtramp_end = sv->sv_sigcode_base +
 				    *sv->sv_szsigcode;
 			} else {
 				kst32.ksigtramp_start = sv->sv_psstrings -
 				    *sv->sv_szsigcode;
 				kst32.ksigtramp_end = sv->sv_psstrings;
 			}
 		}
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, &kst32, sizeof(kst32));
 		return (error);
 	}
 #endif
 	bzero(&kst, sizeof(kst));
 	if (sv->sv_sigcode_base != 0) {
 		kst.ksigtramp_start = (char *)sv->sv_sigcode_base;
 		kst.ksigtramp_end = (char *)sv->sv_sigcode_base +
 		    *sv->sv_szsigcode;
 	} else {
 		kst.ksigtramp_start = (char *)sv->sv_psstrings -
 		    *sv->sv_szsigcode;
 		kst.ksigtramp_end = (char *)sv->sv_psstrings;
 	}
 	PROC_UNLOCK(p);
 	error = SYSCTL_OUT(req, &kst, sizeof(kst));
 	return (error);
 }
 
 /*
  * Registration of the kern.proc sysctl tree.  Each entry below attaches one
  * of the handlers in this file to a KERN_PROC_* identifier.  The "_td"
  * entries use the same handler with KERN_PROC_INC_THREAD or'ed into the
  * identifier.
  */
 SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD,  0, "Process table");
 
 SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
 	CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
 	"Return entire process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc, "Return process table, no threads");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
 	CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
 	sysctl_kern_proc_args, "Process argument list");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE,
 	sysctl_kern_proc_env, "Process environment");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
 	"Process syscall vector name (ABI type)");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
 	sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
 	"Return process table, no threads");
 
 #ifdef COMPAT_FREEBSD7
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
 #endif
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
 
 #if defined(STACK) || defined(DDB)
 static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
 #endif
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW |
 	CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit,
 	"Process resource limits");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings,
 	"Process ps_strings location");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW |
 	CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel,
 	"Process binary osreldate");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGTRAMP, sigtramp, CTLFLAG_RD |
 	CTLFLAG_MPSAFE, sysctl_kern_proc_sigtramp,
 	"Process signal trampoline location");
 
 /*
  * Generation count sampled by stop_all_proc() to detect changes to the
  * process list during a pass.  Presumably incremented when a process is
  * added to allproc (e.g. on fork) -- TODO confirm at the writers.
  */
 int allproc_gen;
 
 /*
  * Stop every user process except the current one by calling
  * thread_single(p, SINGLE_ALLPROC) on each.  Processes flagged P_KTHREAD,
  * P_SYSTEM or P_TOTAL_STOP are skipped.
  *
  * The current process is used as a cursor in the allproc list: it is moved
  * to the head, then stepped past each process examined.  That lets
  * allproc_lock be dropped around thread_single() and the scan resume from
  * the cursor afterwards.
  *
  * Passes are repeated, yielding in between, until one finds nothing to
  * stop, no exiting or single-stopped process, no thread_single() failure
  * and no change in allproc_gen.
  */
 void
 stop_all_proc(void)
 {
 	struct proc *cp, *p;
 	int r, gen;
 	bool restart, seen_stopped, seen_exiting, stopped_some;
 
 	cp = curproc;
 	/*
 	 * stop_all_proc() assumes that all processes which have
 	 * usermode must be stopped, except the current process, for
 	 * obvious reasons.  Since other threads in the process
 	 * establishing global stop could unstop something, disable
 	 * calls from multithreaded processes as precaution.  The
 	 * service must not be user-callable anyway.
 	 */
 	KASSERT((cp->p_flag & P_HADTHREADS) == 0 ||
 	    (cp->p_flag & P_KTHREAD) != 0, ("mt stop_all_proc"));
 
 allproc_loop:
 	sx_xlock(&allproc_lock);
 	gen = allproc_gen;
 	seen_exiting = seen_stopped = stopped_some = restart = false;
 	/* Move the cursor (curproc) to the head of the list. */
 	LIST_REMOVE(cp, p_list);
 	LIST_INSERT_HEAD(&allproc, cp, p_list);
 	for (;;) {
 		p = LIST_NEXT(cp, p_list);
 		if (p == NULL)
 			break;
 		/* Step the cursor past p before examining it. */
 		LIST_REMOVE(cp, p_list);
 		LIST_INSERT_AFTER(p, cp, p_list);
 		PROC_LOCK(p);
 		if ((p->p_flag & (P_KTHREAD | P_SYSTEM |
 		    P_TOTAL_STOP)) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if ((p->p_flag & P_WEXIT) != 0) {
 			seen_exiting = true;
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			/*
 			 * Stopped processes are tolerated when there
 			 * are no other processes which might continue
 			 * them.  P_STOPPED_SINGLE but not
 			 * P_TOTAL_STOP process still has at least one
 			 * thread running.
 			 */
 			seen_stopped = true;
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * Hold p and drop allproc_lock around thread_single(); the
 		 * process lock remains held across the call.
 		 */
 		_PHOLD(p);
 		sx_xunlock(&allproc_lock);
 		r = thread_single(p, SINGLE_ALLPROC);
 		if (r != 0)
 			restart = true;
 		else
 			stopped_some = true;
 		_PRELE(p);
 		PROC_UNLOCK(p);
 		sx_xlock(&allproc_lock);
 	}
 	/* Catch forked children we did not see in iteration. */
 	if (gen != allproc_gen)
 		restart = true;
 	sx_xunlock(&allproc_lock);
 	/* Any activity in this pass forces another full pass. */
 	if (restart || stopped_some || seen_exiting || seen_stopped) {
 		kern_yield(PRI_USER);
 		goto allproc_loop;
 	}
 }
 
 /*
  * Counterpart of stop_all_proc(): call thread_single_end(p,
  * SINGLE_ALLPROC) on every process that has P_TOTAL_STOP set.
  *
  * The current process is used as a cursor in the allproc list, in the same
  * way as in stop_all_proc(), so allproc_lock can be dropped around each
  * thread_single_end() call.
  */
 void
 resume_all_proc(void)
 {
 	struct proc *cp, *p;
 
 	cp = curproc;
 	sx_xlock(&allproc_lock);
 	/* Move the cursor (curproc) to the head of the list. */
 	LIST_REMOVE(cp, p_list);
 	LIST_INSERT_HEAD(&allproc, cp, p_list);
 	for (;;) {
 		p = LIST_NEXT(cp, p_list);
 		if (p == NULL)
 			break;
 		/* Step the cursor past p before examining it. */
 		LIST_REMOVE(cp, p_list);
 		LIST_INSERT_AFTER(p, cp, p_list);
 		PROC_LOCK(p);
 		if ((p->p_flag & P_TOTAL_STOP) != 0) {
 			sx_xunlock(&allproc_lock);
 			_PHOLD(p);
 			thread_single_end(p, SINGLE_ALLPROC);
 			_PRELE(p);
 			PROC_UNLOCK(p);
 			sx_xlock(&allproc_lock);
 		} else {
 			PROC_UNLOCK(p);
 		}
 	}
 	sx_xunlock(&allproc_lock);
 }
 
 /*
  * Debugging aid for stop_all_proc()/resume_all_proc(), compiled in
  * whenever TOTAL_STOP_DEBUG is defined (it is defined unconditionally
  * here).
  */
 #define	TOTAL_STOP_DEBUG	1
 #ifdef TOTAL_STOP_DEBUG
 volatile static int ap_resume;
 #include <sys/mount.h>
 
 /*
  * Handler for the debug.stop_all_proc sysctl.  Writing a non-zero value
  * stops all processes, suspends the syncer, busy-waits until ap_resume
  * becomes non-zero, then resumes the syncer and all processes.
  *
  * ap_resume is reset to 0 on every invocation, reads included.  The value
  * read back is always the local 'val' (0), not ap_resume.
  *
  * NOTE(review): nothing visible in this block sets ap_resume to non-zero;
  * presumably it is changed externally (e.g. from a debugger) -- confirm.
  */
 static int
 sysctl_debug_stop_all_proc(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = 0;
 	ap_resume = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (val != 0) {
 		stop_all_proc();
 		syncer_suspend();
 		/* Spin until ap_resume is changed from outside. */
 		while (ap_resume == 0)
 			;
 		syncer_resume();
 		resume_all_proc();
 	}
 	return (0);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, stop_all_proc, CTLTYPE_INT | CTLFLAG_RW |
     CTLFLAG_MPSAFE, __DEVOLATILE(int *, &ap_resume), 0,
     sysctl_debug_stop_all_proc, "I",
     "");
 #endif
Index: projects/ino64/sys/kern/sys_socket.c
===================================================================
--- projects/ino64/sys/kern/sys_socket.c	(revision 276110)
+++ projects/ino64/sys/kern/sys_socket.c	(revision 276111)
@@ -1,365 +1,367 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_socket.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/filio.h>			/* XXX */
 #include <sys/sockio.h>
 #include <sys/stat.h>
 #include <sys/uio.h>
 #include <sys/ucred.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/user.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
 #include <security/mac/mac_framework.h>
 
 /* File operation handlers for sockets; all but soo_kqfilter live here. */
 static fo_rdwr_t soo_read;
 static fo_rdwr_t soo_write;
 static fo_ioctl_t soo_ioctl;
 static fo_poll_t soo_poll;
 extern fo_kqfilter_t soo_kqfilter;
 static fo_stat_t soo_stat;
 static fo_close_t soo_close;
 static fo_fill_kinfo_t soo_fill_kinfo;
 
 /*
  * File operations vector for socket-backed files.  Truncate, chmod, chown
  * and sendfile are routed to the invfo_* handlers.
  */
 struct fileops	socketops = {
 	.fo_read = soo_read,
 	.fo_write = soo_write,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = soo_ioctl,
 	.fo_poll = soo_poll,
 	.fo_kqfilter = soo_kqfilter,
 	.fo_stat = soo_stat,
 	.fo_close = soo_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = soo_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 /*
  * fo_read handler for sockets: receive into 'uio' from the socket stored
  * in fp->f_data via soreceive().  With MAC configured, the receive is
  * first checked against 'active_cred'.  'flags' and 'td' are not used.
  *
  * Returns the MAC error if the check fails, otherwise the soreceive()
  * result.
  */
 static int
 soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_receive(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = soreceive(so, 0, uio, 0, 0, 0);
 	return (error);
 }
 
 /*
  * fo_write handler for sockets: send the data described by 'uio' on the
  * socket stored in fp->f_data via sosend().  With MAC configured, the send
  * is first checked against 'active_cred'.  'flags' and 'td' are not used;
  * the thread is taken from uio->uio_td.
  *
  * If sosend() fails with EPIPE and SO_NOSIGPIPE is not set on the socket,
  * SIGPIPE is posted to the writing thread.
  *
  * Returns the MAC error if the check fails, otherwise the sosend() result.
  */
 static int
 soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_send(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
 	if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 		/* tdsignal() is called with the process lock held. */
 		PROC_LOCK(uio->uio_td->td_proc);
 		tdsignal(uio->uio_td, SIGPIPE);
 		PROC_UNLOCK(uio->uio_td->td_proc);
 	}
 	return (error);
 }
 
 /*
  * fo_ioctl handler for sockets.  Generic file and socket requests are
  * handled inline; anything else is forwarded by ioctl group to ifioctl()
  * ('i'), rtioctl_fib() ('r') or the protocol's pru_control method.
  *
  * 'data' is treated as a pointer to int for every request handled inline.
  * 'active_cred' is not used.
  *
  * Returns 0 for the inline requests other than FIOSETOWN/SIOCSPGRP, which
  * return the fsetown() result; forwarded requests return the callee's
  * result.
  */
 static int
 soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error = 0;
 
 	switch (cmd) {
 	case FIONBIO:
 		/* Set or clear non-blocking I/O on the socket. */
 		SOCK_LOCK(so);
 		if (*(int *)data)
 			so->so_state |= SS_NBIO;
 		else
 			so->so_state &= ~SS_NBIO;
 		SOCK_UNLOCK(so);
 		break;
 
 	case FIOASYNC:
 		/*
 		 * XXXRW: This code separately acquires SOCK_LOCK(so) and
 		 * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
 		 * mutex to avoid introducing the assumption that they are
 		 * the same.
 		 */
 		if (*(int *)data) {
 			SOCK_LOCK(so);
 			so->so_state |= SS_ASYNC;
 			SOCK_UNLOCK(so);
 			SOCKBUF_LOCK(&so->so_rcv);
 			so->so_rcv.sb_flags |= SB_ASYNC;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			SOCKBUF_LOCK(&so->so_snd);
 			so->so_snd.sb_flags |= SB_ASYNC;
 			SOCKBUF_UNLOCK(&so->so_snd);
 		} else {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ASYNC;
 			SOCK_UNLOCK(so);
 			SOCKBUF_LOCK(&so->so_rcv);
 			so->so_rcv.sb_flags &= ~SB_ASYNC;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			SOCKBUF_LOCK(&so->so_snd);
 			so->so_snd.sb_flags &= ~SB_ASYNC;
 			SOCKBUF_UNLOCK(&so->so_snd);
 		}
 		break;
 
 	case FIONREAD:
 		/* Unlocked read. */
 		*(int *)data = sbavail(&so->so_rcv);
 		break;
 
 	case FIONWRITE:
 		/* Unlocked read. */
 		*(int *)data = sbavail(&so->so_snd);
 		break;
 
 	case FIONSPACE:
 		/*
 		 * Unlocked read.  Report 0 rather than calling sbspace()
 		 * when the send buffer is already over either of its limits.
 		 */
 		if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) ||
 		    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
 			*(int *)data = 0;
 		else
 			*(int *)data = sbspace(&so->so_snd);
 		break;
 
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &so->so_sigio);
 		break;
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&so->so_sigio);
 		break;
 
 	case SIOCSPGRP:
 		/* Same as FIOSETOWN, with the sign of the id inverted. */
 		error = fsetown(-(*(int *)data), &so->so_sigio);
 		break;
 
 	case SIOCGPGRP:
 		/* Same as FIOGETOWN, with the sign of the id inverted. */
 		*(int *)data = -fgetown(&so->so_sigio);
 		break;
 
 	case SIOCATMARK:
 		/* Unlocked read. */
 		*(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
 		break;
 	default:
 		/*
 		 * Interface/routing/protocol specific ioctls: interface and
 		 * routing ioctls should have a different entry since a
 		 * socket is unnecessary.
 		 */
 		if (IOCGROUP(cmd) == 'i')
 			error = ifioctl(so, cmd, data, td);
 		else if (IOCGROUP(cmd) == 'r') {
 			CURVNET_SET(so->so_vnet);
 			error = rtioctl_fib(cmd, data, so->so_fibnum);
 			CURVNET_RESTORE();
 		} else {
 			CURVNET_SET(so->so_vnet);
 			error = ((*so->so_proto->pr_usrreqs->pru_control)
 			    (so, cmd, data, 0, td));
 			CURVNET_RESTORE();
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * fo_poll handler for sockets: forward to sopoll() using the credential
  * stored in the file (fp->f_cred).  With MAC configured, the poll is first
  * checked against 'active_cred' and the MAC error is returned on failure.
  */
 static int
 soo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_socket_check_poll(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	return (sopoll(so, events, fp->f_cred, td));
 }
 
 /*
  * fo_stat handler for sockets.  Zeroes 'ub', sets st_mode to S_IFSOCK plus
  * read/write permission bits derived from the socket buffer states, sets
  * st_size from the receive buffer and st_uid/st_gid from the socket's
  * credential, then lets the protocol's pru_sense method finish the job.
  *
  * With MAC configured, a failed check returns the MAC error; at that point
  * 'ub' has already been zeroed and st_mode set to S_IFSOCK.  'td' is not
  * used.
  *
  * Returns the MAC error or the pru_sense result.
  */
 static int
 soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	struct sockbuf *sb;
 #ifdef MAC
 	int error;
 #endif
 
 	bzero((caddr_t)ub, sizeof (*ub));
 	ub->st_mode = S_IFSOCK;
 #ifdef MAC
 	error = mac_socket_check_stat(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	/*
 	 * If SBS_CANTRCVMORE is set, but there's still data left in the
 	 * receive buffer, the socket is still readable.
 	 */
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
 		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
 	/* Reported size excludes the sb_ctl bytes. */
 	ub->st_size = sbavail(sb) - sb->sb_ctl;
 	SOCKBUF_UNLOCK(sb);
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
 		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
 	SOCKBUF_UNLOCK(sb);
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
 	return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
 }
 
 /*
  * API socket close on file pointer.  We call soclose() to close the socket
  * (including initiating closing protocols).  soclose() will sorele() the
  * file reference but the actual socket will not go away until the socket's
  * ref count hits 0.
  *
  * The file is detached from the socket first: f_ops is pointed at
  * badfileops and f_data is cleared.  Returns 0 if the file had no socket,
  * otherwise the soclose() result.  'td' is not used.
  */
 static int
 soo_close(struct file *fp, struct thread *td)
 {
 	int error = 0;
 	struct socket *so;
 
 	so = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	if (so)
 		error = soclose(so);
 	return (error);
 }
 
 /*
  * fo_fill_kinfo handler for sockets: describe the socket behind 'fp' in
  * 'kif'.  Fills in the domain, type, protocol and pcb address, plus the
  * per-domain details below, the local and peer addresses when they can be
  * obtained and fit, and the domain name as kf_path.  'fdp' is not used.
  *
  * Always returns 0; failures to obtain an address are ignored.
  */
 static int
 soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct sockaddr *sa;
 	struct inpcb *inpcb;
 	struct unpcb *unpcb;
 	struct socket *so;
 	int error;
 
 	kif->kf_type = KF_TYPE_SOCKET;
 	so = fp->f_data;
 	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
 	kif->kf_sock_type = so->so_type;
 	kif->kf_sock_protocol = so->so_proto->pr_protocol;
 	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
 	switch (kif->kf_sock_domain) {
 	case AF_INET:
 	case AF_INET6:
 		/* For TCP, also export the inp_ppcb pointer. */
 		if (kif->kf_sock_protocol == IPPROTO_TCP) {
 			if (so->so_pcb != NULL) {
 				inpcb = (struct inpcb *)(so->so_pcb);
 				kif->kf_un.kf_sock.kf_sock_inpcb =
 				    (uintptr_t)inpcb->inp_ppcb;
 			}
 		}
 		break;
 	case AF_UNIX:
 		/*
 		 * For a connected unix socket, export the peer pcb pointer
 		 * and both socket buffer states.
 		 */
 		if (so->so_pcb != NULL) {
 			unpcb = (struct unpcb *)(so->so_pcb);
 			if (unpcb->unp_conn) {
 				kif->kf_un.kf_sock.kf_sock_unpconn =
 				    (uintptr_t)unpcb->unp_conn;
 				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
 				    so->so_rcv.sb_state;
 				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
 				    so->so_snd.sb_state;
 			}
 		}
 		break;
 	}
 	/*
 	 * NOTE(review): for both address lookups below, 'sa' is freed only
 	 * inside the branch that copies it.  If the call succeeds but sa_len
 	 * exceeds the destination field, 'sa' is not freed in this function;
 	 * this looks like a leak -- confirm.
 	 */
 	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
-	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
-		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
+	if (error == 0 &&
+	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) {
+		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len);
 		free(sa, M_SONAME);
 	}
 	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
-	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
-		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
+	if (error == 0 &&
+	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) {
+		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len);
 		free(sa, M_SONAME);
 	}
 	/*
 	 * NOTE(review): strncpy() leaves kf_path unterminated if dom_name
 	 * fills the whole buffer; presumably domain names are always
 	 * shorter than kf_path -- confirm.
 	 */
 	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
 	    sizeof(kif->kf_path));
 	return (0);	
 }
Index: projects/ino64/sys/kern/tty_pts.c
===================================================================
--- projects/ino64/sys/kern/tty_pts.c	(revision 276110)
+++ projects/ino64/sys/kern/tty_pts.c	(revision 276111)
@@ -1,867 +1,869 @@
 /*-
  * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed under sponsorship from Snow
  * B.V., the Netherlands.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* Add compatibility bits for FreeBSD. */
 #define PTS_COMPAT
 /* Add pty(4) compat bits. */
 #define PTS_EXTERNAL
 /* Add bits to make Linux binaries work. */
 #define PTS_LINUX
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/serial.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/tty.h>
 #include <sys/ttycom.h>
 #include <sys/user.h>
 
 #include <machine/stdarg.h>
 
 /*
  * Our utmp(5) format is limited to 8-byte TTY line names.  This means
  * we can at most allocate 1000 pseudo-terminals ("pts/999").  Allow
  * users to increase this number, assuming they have manually increased
  * UT_LINESIZE.
  */
 static struct unrhdr *pts_pool;
 
 static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
 
 /*
  * Per-PTS structure.
  *
  * List of locks
  * (t)	locked by tty_lock()
  * (c)	const until freeing
  */
 struct pts_softc {
 	int		pts_unit;	/* (c) Device unit number. */
 	unsigned int	pts_flags;	/* (t) Device flags. */
 #define	PTS_PKT		0x1	/* Packet mode. */
 #define	PTS_FINISHED	0x2	/* Return errors on read()/write(). */
 	char		pts_pkt;	/* (t) Unread packet mode data. */
 
 	struct cv	pts_inwait;	/* (t) Blocking write() on master. */
 	struct selinfo	pts_inpoll;	/* (t) Select queue for write(). */
 	struct cv	pts_outwait;	/* (t) Blocking read() on master. */
 	struct selinfo	pts_outpoll;	/* (t) Select queue for read(). */
 
 #ifdef PTS_EXTERNAL
 	struct cdev	*pts_cdev;	/* (c) Master device node. */
 #endif /* PTS_EXTERNAL */
 
 	struct ucred	*pts_cred;	/* (c) Resource limit. */
 };
 
 /*
  * Controller-side file operations.
  */
 
 static int
 ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0;
 	char pkt;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	tty_lock(tp);
 
 	for (;;) {
 		/*
 		 * Implement packet mode. When packet mode is turned on,
 		 * the first byte contains a bitmask of events that
 		 * occured (start, stop, flush, window size, etc).
 		 */
 		if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
 			pkt = psc->pts_pkt;
 			psc->pts_pkt = 0;
 			tty_unlock(tp);
 
 			error = ureadc(pkt, uio);
 			return (error);
 		}
 
 		/*
 		 * Transmit regular data.
 		 *
 		 * XXX: We shouldn't use ttydisc_getc_poll()! Even
 		 * though in this implementation, there is likely going
 		 * to be data, we should just call ttydisc_getc_uio()
 		 * and use its return value to sleep.
 		 */
 		if (ttydisc_getc_poll(tp)) {
 			if (psc->pts_flags & PTS_PKT) {
 				/*
 				 * XXX: Small race. Fortunately PTY
 				 * consumers aren't multithreaded.
 				 */
 
 				tty_unlock(tp);
 				error = ureadc(TIOCPKT_DATA, uio);
 				if (error)
 					return (error);
 				tty_lock(tp);
 			}
 
 			error = ttydisc_getc_uio(tp, uio);
 			break;
 		}
 
 		/* Maybe the device isn't used anyway. */
 		if (psc->pts_flags & PTS_FINISHED)
 			break;
 
 		/* Wait for more data. */
 		if (fp->f_flag & O_NONBLOCK) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
 		if (error != 0)
 			break;
 	}
 
 	tty_unlock(tp);
 
 	return (error);
 }
 
 static int
 ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	char ib[256], *ibstart;
 	size_t iblen, rintlen;
 	int error = 0;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	for (;;) {
 		ibstart = ib;
 		iblen = MIN(uio->uio_resid, sizeof ib);
 		error = uiomove(ib, iblen, uio);
 
 		tty_lock(tp);
 		if (error != 0) {
 			iblen = 0;
 			goto done;
 		}
 
 		/*
 		 * When possible, avoid the slow path. rint_bypass()
 		 * copies all input to the input queue at once.
 		 */
 		MPASS(iblen > 0);
 		do {
 			rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
 			ibstart += rintlen;
 			iblen -= rintlen;
 			if (iblen == 0) {
 				/* All data written. */
 				break;
 			}
 
 			/* Maybe the device isn't used anyway. */
 			if (psc->pts_flags & PTS_FINISHED) {
 				error = EIO;
 				goto done;
 			}
 
 			/* Wait for more data. */
 			if (fp->f_flag & O_NONBLOCK) {
 				error = EWOULDBLOCK;
 				goto done;
 			}
 
 			/* Wake up users on the slave side. */
 			ttydisc_rint_done(tp);
 			error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
 			if (error != 0)
 				goto done;
 		} while (iblen > 0);
 
 		if (uio->uio_resid == 0)
 			break;
 		tty_unlock(tp);
 	}
 
 done:	ttydisc_rint_done(tp);
 	tty_unlock(tp);
 
 	/*
 	 * Don't account for the part of the buffer that we couldn't
 	 * pass to the TTY.
 	 */
 	uio->uio_resid += iblen;
 	return (error);
 }
 
 static int
 ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0, sig;
 
 	switch (cmd) {
 	case FIODTYPE:
 		*(int *)data = D_TTY;
 		return (0);
 	case FIONBIO:
 		/* This device supports non-blocking operation. */
 		return (0);
 	case FIONREAD:
 		tty_lock(tp);
 		if (psc->pts_flags & PTS_FINISHED) {
 			/* Force read() to be called. */
 			*(int *)data = 1;
 		} else {
 			*(int *)data = ttydisc_getc_poll(tp);
 		}
 		tty_unlock(tp);
 		return (0);
 	case FIODGNAME: {
 		struct fiodgname_arg *fgn;
 		const char *p;
 		int i;
 
 		/* Reverse device name lookups, for ptsname() and ttyname(). */
 		fgn = data;
 		p = tty_devname(tp);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			return (EINVAL);
 		return copyout(p, fgn->buf, i);
 	}
 
 	/*
 	 * We need to implement TIOCGPGRP and TIOCGSID here again. When
 	 * called on the pseudo-terminal master, it should not check if
 	 * the terminal is the foreground terminal of the calling
 	 * process.
 	 *
 	 * TIOCGETA is also implemented here. Various Linux PTY routines
 	 * often call isatty(), which is implemented by tcgetattr().
 	 */
 #ifdef PTS_LINUX
 	case TIOCGETA:
 		/* Obtain terminal flags through tcgetattr(). */
 		tty_lock(tp);
 		*(struct termios*)data = tp->t_termios;
 		tty_unlock(tp);
 		return (0);
 #endif /* PTS_LINUX */
 	case TIOCSETAF:
 	case TIOCSETAW:
 		/*
 		 * We must make sure we turn tcsetattr() calls of TCSAFLUSH and
 		 * TCSADRAIN into something different. If an application would
 		 * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may
 		 * deadlock waiting for all data to be read.
 		 */
 		cmd = TIOCSETA;
 		break;
 #if defined(PTS_COMPAT) || defined(PTS_LINUX)
 	case TIOCGPTN:
 		/*
 		 * Get the device unit number.
 		 */
 		if (psc->pts_unit < 0)
 			return (ENOTTY);
 		*(unsigned int *)data = psc->pts_unit;
 		return (0);
 #endif /* PTS_COMPAT || PTS_LINUX */
 	case TIOCGPGRP:
 		/* Get the foreground process group ID. */
 		tty_lock(tp);
 		if (tp->t_pgrp != NULL)
 			*(int *)data = tp->t_pgrp->pg_id;
 		else
 			*(int *)data = NO_PID;
 		tty_unlock(tp);
 		return (0);
 	case TIOCGSID:
 		/* Get the session leader process ID. */
 		tty_lock(tp);
 		if (tp->t_session == NULL)
 			error = ENOTTY;
 		else
 			*(int *)data = tp->t_session->s_sid;
 		tty_unlock(tp);
 		return (error);
 	case TIOCPTMASTER:
 		/* Yes, we are a pseudo-terminal master. */
 		return (0);
 	case TIOCSIG:
 		/* Signal the foreground process group. */
 		sig = *(int *)data;
 		if (sig < 1 || sig >= NSIG)
 			return (EINVAL);
 
 		tty_lock(tp);
 		tty_signal_pgrp(tp, sig);
 		tty_unlock(tp);
 		return (0);
 	case TIOCPKT:
 		/* Enable/disable packet mode. */
 		tty_lock(tp);
 		if (*(int *)data)
 			psc->pts_flags |= PTS_PKT;
 		else
 			psc->pts_flags &= ~PTS_PKT;
 		tty_unlock(tp);
 		return (0);
 	}
 
 	/* Just redirect this ioctl to the slave device. */
 	tty_lock(tp);
 	error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
 	tty_unlock(tp);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	return (error);
 }
 
 static int
 ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int revents = 0;
 
 	tty_lock(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		/* Slave device is not opened. */
 		tty_unlock(tp);
 		return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
 	}
 
 	if (events & (POLLIN|POLLRDNORM)) {
 		/* See if we can getc something. */
 		if (ttydisc_getc_poll(tp) ||
 		    (psc->pts_flags & PTS_PKT && psc->pts_pkt))
 			revents |= events & (POLLIN|POLLRDNORM);
 	}
 	if (events & (POLLOUT|POLLWRNORM)) {
 		/* See if we can rint something. */
 		if (ttydisc_rint_poll(tp))
 			revents |= events & (POLLOUT|POLLWRNORM);
 	}
 
 	/*
 	 * No need to check for POLLHUP here. This device cannot be used
 	 * as a callout device, which means we always have a carrier,
 	 * because the master is.
 	 */
 
 	if (revents == 0) {
 		/*
 		 * This code might look misleading, but the naming of
 		 * poll events on this side is the opposite of the slave
 		 * device.
 		 */
 		if (events & (POLLIN|POLLRDNORM))
 			selrecord(td, &psc->pts_outpoll);
 		if (events & (POLLOUT|POLLWRNORM))
 			selrecord(td, &psc->pts_inpoll);
 	}
 
 	tty_unlock(tp);
 
 	return (revents);
 }
 
 /*
  * kqueue support.
  */
 
 static void
 pts_kqops_read_detach(struct knote *kn)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
 }
 
 static int
 pts_kqops_read_event(struct knote *kn, long hint)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	} else {
 		kn->kn_data = ttydisc_getc_poll(tp);
 		return (kn->kn_data > 0);
 	}
 }
 
 static void
 pts_kqops_write_detach(struct knote *kn)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
 }
 
 static int
 pts_kqops_write_event(struct knote *kn, long hint)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	} else {
 		kn->kn_data = ttydisc_rint_poll(tp);
 		return (kn->kn_data > 0);
 	}
 }
 
 static struct filterops pts_kqops_read = {
 	.f_isfd = 1,
 	.f_detach = pts_kqops_read_detach,
 	.f_event = pts_kqops_read_event,
 };
 static struct filterops pts_kqops_write = {
 	.f_isfd = 1,
 	.f_detach = pts_kqops_write_detach,
 	.f_event = pts_kqops_write_event,
 };
 
 static int
 ptsdev_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0;
 
 	tty_lock(tp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &pts_kqops_read;
 		knlist_add(&psc->pts_outpoll.si_note, kn, 1);
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &pts_kqops_write;
 		knlist_add(&psc->pts_inpoll.si_note, kn, 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	tty_unlock(tp);
 	return (error);
 }
 
 static int
 ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 #ifdef PTS_EXTERNAL
 	struct pts_softc *psc = tty_softc(tp);
 #endif /* PTS_EXTERNAL */
 	struct cdev *dev = tp->t_dev;
 
 	/*
 	 * According to POSIX, we must implement an fstat(). This also
 	 * makes this implementation compatible with Linux binaries,
 	 * because Linux calls fstat() on the pseudo-terminal master to
 	 * obtain st_rdev.
 	 *
 	 * XXX: POSIX also mentions we must fill in st_dev, but how?
 	 */
 
 	bzero(sb, sizeof *sb);
 #ifdef PTS_EXTERNAL
 	if (psc->pts_cdev != NULL)
 		sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
 	else
 #endif /* PTS_EXTERNAL */
 		sb->st_ino = sb->st_rdev = tty_udev(tp);
 
 	sb->st_atim = dev->si_atime;
 	sb->st_ctim = dev->si_ctime;
 	sb->st_mtim = dev->si_mtime;
 	sb->st_uid = dev->si_uid;
 	sb->st_gid = dev->si_gid;
 	sb->st_mode = dev->si_mode | S_IFCHR;
 
 	return (0);
 }
 
 static int
 ptsdev_close(struct file *fp, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 
 	/* Deallocate TTY device. */
 	tty_lock(tp);
 	tty_rel_gone(tp);
 
 	/*
 	 * Open of /dev/ptmx or /dev/ptyXX changes the type of file
 	 * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode
 	 * use count, we need to decrement it, and possibly do other
 	 * required cleanup.
 	 */
 	if (fp->f_vnode != NULL)
 		return (vnops.fo_close(fp, td));
 
 	return (0);
 }
 
 static int
 ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct tty *tp;
 
 	kif->kf_type = KF_TYPE_PTS;
 	tp = fp->f_data;
 	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
+	kif->kf_un.kf_pts.kf_pts_dev_freebsd10 =
+	    kif->kf_un.kf_pts.kf_pts_dev; /* truncate */
 	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
 	return (0);
 }
 
 static struct fileops ptsdev_ops = {
 	.fo_read	= ptsdev_read,
 	.fo_write	= ptsdev_write,
 	.fo_truncate	= invfo_truncate,
 	.fo_ioctl	= ptsdev_ioctl,
 	.fo_poll	= ptsdev_poll,
 	.fo_kqfilter	= ptsdev_kqfilter,
 	.fo_stat	= ptsdev_stat,
 	.fo_close	= ptsdev_close,
 	.fo_chmod	= invfo_chmod,
 	.fo_chown	= invfo_chown,
 	.fo_sendfile	= invfo_sendfile,
 	.fo_fill_kinfo	= ptsdev_fill_kinfo,
 	.fo_flags	= DFLAG_PASSABLE,
 };
 
 /*
  * Driver-side hooks.
  */
 
 static void
 ptsdrv_outwakeup(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	cv_broadcast(&psc->pts_outwait);
 	selwakeup(&psc->pts_outpoll);
 	KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
 }
 
 static void
 ptsdrv_inwakeup(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	cv_broadcast(&psc->pts_inwait);
 	selwakeup(&psc->pts_inpoll);
 	KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
 }
 
 static int
 ptsdrv_open(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	psc->pts_flags &= ~PTS_FINISHED;
 
 	return (0);
 }
 
 static void
 ptsdrv_close(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	/* Wake up any blocked readers/writers. */
 	psc->pts_flags |= PTS_FINISHED;
 	ptsdrv_outwakeup(tp);
 	ptsdrv_inwakeup(tp);
 }
 
 static void
 ptsdrv_pktnotify(struct tty *tp, char event)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	/*
 	 * Clear conflicting flags.
 	 */
 
 	switch (event) {
 	case TIOCPKT_STOP:
 		psc->pts_pkt &= ~TIOCPKT_START;
 		break;
 	case TIOCPKT_START:
 		psc->pts_pkt &= ~TIOCPKT_STOP;
 		break;
 	case TIOCPKT_NOSTOP:
 		psc->pts_pkt &= ~TIOCPKT_DOSTOP;
 		break;
 	case TIOCPKT_DOSTOP:
 		psc->pts_pkt &= ~TIOCPKT_NOSTOP;
 		break;
 	}
 
 	psc->pts_pkt |= event;
 	ptsdrv_outwakeup(tp);
 }
 
 static void
 ptsdrv_free(void *softc)
 {
 	struct pts_softc *psc = softc;
 
 	/* Make device number available again. */
 	if (psc->pts_unit >= 0)
 		free_unr(pts_pool, psc->pts_unit);
 
 	chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
 	racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
 	crfree(psc->pts_cred);
 
 	seldrain(&psc->pts_inpoll);
 	seldrain(&psc->pts_outpoll);
 	knlist_destroy(&psc->pts_inpoll.si_note);
 	knlist_destroy(&psc->pts_outpoll.si_note);
 
 #ifdef PTS_EXTERNAL
 	/* Destroy master device as well. */
 	if (psc->pts_cdev != NULL)
 		destroy_dev_sched(psc->pts_cdev);
 #endif /* PTS_EXTERNAL */
 
 	free(psc, M_PTS);
 }
 
 static struct ttydevsw pts_class = {
 	.tsw_flags	= TF_NOPREFIX,
 	.tsw_outwakeup	= ptsdrv_outwakeup,
 	.tsw_inwakeup	= ptsdrv_inwakeup,
 	.tsw_open	= ptsdrv_open,
 	.tsw_close	= ptsdrv_close,
 	.tsw_pktnotify	= ptsdrv_pktnotify,
 	.tsw_free	= ptsdrv_free,
 };
 
 #ifndef PTS_EXTERNAL
 static
 #endif /* !PTS_EXTERNAL */
 int
 pts_alloc(int fflags, struct thread *td, struct file *fp)
 {
 	int unit, ok, error;
 	struct tty *tp;
 	struct pts_softc *psc;
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 
 	/* Resource limiting. */
 	PROC_LOCK(p);
 	error = racct_add(p, RACCT_NPTS, 1);
 	if (error != 0) {
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
 	if (!ok) {
 		racct_sub(p, RACCT_NPTS, 1);
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	PROC_UNLOCK(p);
 
 	/* Try to allocate a new pts unit number. */
 	unit = alloc_unr(pts_pool);
 	if (unit < 0) {
 		racct_sub(p, RACCT_NPTS, 1);
 		chgptscnt(cred->cr_ruidinfo, -1, 0);
 		return (EAGAIN);
 	}
 
 	/* Allocate TTY and softc. */
 	psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
 	cv_init(&psc->pts_inwait, "ptsin");
 	cv_init(&psc->pts_outwait, "ptsout");
 
 	psc->pts_unit = unit;
 	psc->pts_cred = crhold(cred);
 
 	tp = tty_alloc(&pts_class, psc);
 	knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
 	knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
 
 	/* Expose the slave device as well. */
 	tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
 
 	finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
 
 	return (0);
 }
 
 #ifdef PTS_EXTERNAL
 int
 pts_alloc_external(int fflags, struct thread *td, struct file *fp,
     struct cdev *dev, const char *name)
 {
 	int ok, error;
 	struct tty *tp;
 	struct pts_softc *psc;
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 
 	/* Resource limiting. */
 	PROC_LOCK(p);
 	error = racct_add(p, RACCT_NPTS, 1);
 	if (error != 0) {
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
 	if (!ok) {
 		racct_sub(p, RACCT_NPTS, 1);
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	PROC_UNLOCK(p);
 
 	/* Allocate TTY and softc. */
 	psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
 	cv_init(&psc->pts_inwait, "ptsin");
 	cv_init(&psc->pts_outwait, "ptsout");
 
 	psc->pts_unit = -1;
 	psc->pts_cdev = dev;
 	psc->pts_cred = crhold(cred);
 
 	tp = tty_alloc(&pts_class, psc);
 	knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
 	knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
 
 	/* Expose the slave device as well. */
 	tty_makedev(tp, td->td_ucred, "%s", name);
 
 	finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
 
 	return (0);
 }
 #endif /* PTS_EXTERNAL */
 
 int
 sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
 {
 	int error, fd;
 	struct file *fp;
 
 	/*
 	 * POSIX states it's unspecified when other flags are passed. We
 	 * don't allow this.
 	 */
 	if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC))
 		return (EINVAL);
 
 	error = falloc(td, &fp, &fd, uap->flags);
 	if (error)
 		return (error);
 
 	/* Allocate the actual pseudo-TTY. */
 	error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
 	if (error != 0) {
 		fdclose(td->td_proc->p_fd, fp, fd, td);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/* Pass it back to userspace. */
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 static void
 pts_init(void *unused)
 {
 
 	pts_pool = new_unrhdr(0, INT_MAX, NULL);
 }
 
 SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
Index: projects/ino64/sys/kern/uipc_sem.c
===================================================================
--- projects/ino64/sys/kern/uipc_sem.c	(revision 276110)
+++ projects/ino64/sys/kern/uipc_sem.c	(revision 276111)
@@ -1,1065 +1,1067 @@
 /*-
  * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/_semaphore.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
 /*
  * TODO
  *
  * - Resource limits?
  * - Replace global sem_lock with mtx_pool locks?
  * - Add a MAC check_create() hook for creating new named semaphores.
  */
 
 #ifndef SEM_MAX
 #define	SEM_MAX	30
 #endif
 
 #ifdef SEM_DEBUG
 #define	DP(x)	printf x
 #else
 #define	DP(x)
 #endif
 
 struct ksem_mapping {
 	char		*km_path;
 	Fnv32_t		km_fnv;
 	struct ksem	*km_ksem;
 	LIST_ENTRY(ksem_mapping) km_link;
 };
 
 static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
 static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
 static struct sx ksem_dict_lock;
 static struct mtx ksem_count_lock;
 static struct mtx sem_lock;
 static u_long ksem_hash;
 static int ksem_dead;
 
 #define	KSEM_HASH(fnv)	(&ksem_dictionary[(fnv) & ksem_hash])
 
 static int nsems = 0;
 SYSCTL_DECL(_p1003_1b);
 SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
     "Number of active kernel POSIX semaphores");
 
 static int	kern_sem_wait(struct thread *td, semid_t id, int tryflag,
 		    struct timespec *abstime);
 static int	ksem_access(struct ksem *ks, struct ucred *ucred);
 static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
 		    unsigned int value);
 static int	ksem_create(struct thread *td, const char *path,
 		    semid_t *semidp, mode_t mode, unsigned int value,
 		    int flags, int compat32);
 static void	ksem_drop(struct ksem *ks);
 static int	ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp);
 static struct ksem *ksem_hold(struct ksem *ks);
 static void	ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
 static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
 static void	ksem_module_destroy(void);
 static int	ksem_module_init(void);
 static int	ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	sem_modload(struct module *module, int cmd, void *arg);
 
 static fo_stat_t	ksem_stat;
 static fo_close_t	ksem_closef;
 static fo_chmod_t	ksem_chmod;
 static fo_chown_t	ksem_chown;
 static fo_fill_kinfo_t	ksem_fill_kinfo;
 
 /* File descriptor operations. */
 static struct fileops ksem_ops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = ksem_stat,
 	.fo_close = ksem_closef,
 	.fo_chmod = ksem_chmod,
 	.fo_chown = ksem_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = ksem_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 FEATURE(posix_sem, "POSIX semaphores");
 
 static int
 ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 #ifdef MAC
 	int error;
 #endif
 
 	ks = fp->f_data;
 
 #ifdef MAC
 	error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
 	if (error)
 		return (error);
 #endif
 	
 	/*
 	 * Attempt to return sanish values for fstat() on a semaphore
 	 * file descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 
 	mtx_lock(&sem_lock);
 	sb->st_atim = ks->ks_atime;
 	sb->st_ctim = ks->ks_ctime;
 	sb->st_mtim = ks->ks_mtime;
 	sb->st_birthtim = ks->ks_birthtime;
 	sb->st_uid = ks->ks_uid;
 	sb->st_gid = ks->ks_gid;
 	sb->st_mode = S_IFREG | ks->ks_mode;		/* XXX */
 	mtx_unlock(&sem_lock);
 
 	return (0);
 }
 
 static int
 ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	error = 0;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setmode(active_cred, ks, mode);
 	if (error != 0)
 		goto out;
 #endif
 	error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
 	    active_cred, NULL);
 	if (error != 0)
 		goto out;
 	ks->ks_mode = mode & ACCESSPERMS;
 out:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 static int
 ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	error = 0;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	if (uid == (uid_t)-1)
 		uid = ks->ks_uid;
 	if (gid == (gid_t)-1)
                  gid = ks->ks_gid;
 	if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
 	    (gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
 		goto out;
 	ks->ks_uid = uid;
 	ks->ks_gid = gid;
 out:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 static int
 ksem_closef(struct file *fp, struct thread *td)
 {
 	struct ksem *ks;
 
 	ks = fp->f_data;
 	fp->f_data = NULL;
 	ksem_drop(ks);
 
 	return (0);
 }
 
 static int
 ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct ksem *ks;
 
 	kif->kf_type = KF_TYPE_SEM;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	kif->kf_un.kf_sem.kf_sem_value = ks->ks_value;
 	kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode;	/* XXX */
+	kif->kf_un.kf_sem.kf_sem_mode_freebsd10 =
+	    kif->kf_un.kf_sem.kf_sem_mode; /* truncate */
 	mtx_unlock(&sem_lock);
 	if (ks->ks_path != NULL) {
 		sx_slock(&ksem_dict_lock);
 		if (ks->ks_path != NULL)
 			strlcpy(kif->kf_path, ks->ks_path, sizeof(kif->kf_path));
 		sx_sunlock(&ksem_dict_lock);
 	}
 	return (0);
 }
 
 /*
  * ksem object management including creation and reference counting
  * routines.
  */
 static struct ksem *
 ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
 {
 	struct ksem *ks;
 
 	mtx_lock(&ksem_count_lock);
 	if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
 		mtx_unlock(&ksem_count_lock);
 		return (NULL);
 	}
 	nsems++;
 	mtx_unlock(&ksem_count_lock);
 	ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
 	ks->ks_uid = ucred->cr_uid;
 	ks->ks_gid = ucred->cr_gid;
 	ks->ks_mode = mode;
 	ks->ks_value = value;
 	cv_init(&ks->ks_cv, "ksem");
 	vfs_timestamp(&ks->ks_birthtime);
 	ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
 	refcount_init(&ks->ks_ref, 1);
 #ifdef MAC
 	mac_posixsem_init(ks);
 	mac_posixsem_create(ucred, ks);
 #endif
 
 	return (ks);
 }
 
 static struct ksem *
 ksem_hold(struct ksem *ks)
 {
 
 	refcount_acquire(&ks->ks_ref);
 	return (ks);
 }
 
 static void
 ksem_drop(struct ksem *ks)
 {
 
 	if (refcount_release(&ks->ks_ref)) {
 #ifdef MAC
 		mac_posixsem_destroy(ks);
 #endif
 		cv_destroy(&ks->ks_cv);
 		free(ks, M_KSEM);
 		mtx_lock(&ksem_count_lock);
 		nsems--;
 		mtx_unlock(&ksem_count_lock);
 	}
 }
 
 /*
  * Determine if the credentials have sufficient permissions for read
  * and write access.
  */
 static int
 ksem_access(struct ksem *ks, struct ucred *ucred)
 {
 	int error;
 
 	error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 	    VREAD | VWRITE, ucred, NULL);
 	if (error)
 		error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0);
 	return (error);
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to semaphore objects.  We use the FNV hash on the path to
  * store the mappings in a hash table.
  */
 static struct ksem *
 ksem_lookup(char *path, Fnv32_t fnv)
 {
 	struct ksem_mapping *map;
 
 	LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
 		if (map->km_fnv != fnv)
 			continue;
 		if (strcmp(map->km_path, path) == 0)
 			return (map->km_ksem);
 	}
 
 	return (NULL);
 }
 
 static void
 ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
 {
 	struct ksem_mapping *map;
 
 	map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
 	map->km_path = path;
 	map->km_fnv = fnv;
 	map->km_ksem = ksem_hold(ks);
 	ks->ks_path = path;
 	LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
 }
 
 /*
  * Remove the dictionary entry for 'path'.
  *
  * Returns ENOENT if no entry matches, the error from the MAC unlink
  * check or from ksem_access() if the caller is not permitted, and 0
  * once the mapping has been unlinked, its semaphore reference dropped,
  * and the stored path and mapping freed.
  */
 static int
 ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct ksem_mapping *ent;
 	int error;
 
 	/* Locate the matching entry; 'ent' is NULL if the walk finishes. */
 	LIST_FOREACH(ent, KSEM_HASH(fnv), km_link) {
 		if (ent->km_fnv == fnv && strcmp(ent->km_path, path) == 0)
 			break;
 	}
 	if (ent == NULL)
 		return (ENOENT);
 
 #ifdef MAC
 	error = mac_posixsem_check_unlink(ucred, ent->km_ksem);
 	if (error)
 		return (error);
 #endif
 	error = ksem_access(ent->km_ksem, ucred);
 	if (error)
 		return (error);
 
 	/* The semaphore no longer has a name once unlinked. */
 	ent->km_ksem->ks_path = NULL;
 	LIST_REMOVE(ent, km_link);
 	ksem_drop(ent->km_ksem);
 	free(ent->km_path, M_KSEM);
 	free(ent, M_KSEM);
 	return (0);
 }
 
 /*
  * Copy the new semaphore id (the file descriptor number 'fd') out to
  * the user-supplied location 'semidp'.
  *
  * When COMPAT_FREEBSD32 is compiled in and 'compat32' is non-zero, the
  * id is written as an int32_t; otherwise it is written as a native
  * semid_t.  Returns the result of copyout().  'td' is not used here.
  */
 static int
 ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
     int compat32)
 {
 	semid_t semid;
 #ifdef COMPAT_FREEBSD32
 	int32_t semid32;
 #endif
 	void *ptr;
 	size_t ptrs;
 
 	/*
 	 * The if/else braces below exist only when COMPAT_FREEBSD32 is
 	 * defined; without it the native branch is the whole body.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (compat32) {
 		semid32 = fd;
 		ptr = &semid32;
 		ptrs = sizeof(semid32);
 	} else {
 #endif
 		semid = fd;
 		ptr = &semid;
 		ptrs = sizeof(semid);
 		compat32 = 0; /* silence gcc */
 #ifdef COMPAT_FREEBSD32
 	}
 #endif
 
 	return (copyout(ptr, semidp, ptrs));
 }
 
 /* Other helper routines. */
 
 /*
  * Common back end for creating or opening a semaphore.
  *
  * td       - calling thread; supplies the descriptor table and
  *            credentials.
  * name     - user-space path of a named semaphore, or NULL to create
  *            an anonymous one.
  * semidp   - user-space location that receives the new descriptor.
  * mode     - requested permissions; masked with fdp->fd_cmask and
  *            ACCESSPERMS.
  * value    - initial value; EINVAL if greater than SEM_VALUE_MAX.
  * flags    - O_CREAT / O_EXCL, consulted for named semaphores only.
  * compat32 - passed through to ksem_create_copyout_semid().
  *
  * Returns 0 on success or an errno value.  Every failure after the
  * descriptor has been allocated closes that descriptor again.
  */
 static int
 ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
     unsigned int value, int flags, int compat32)
 {
 	struct filedesc *fdp;
 	struct ksem *ks;
 	struct file *fp;
 	char *path;
 	Fnv32_t fnv;
 	int error, fd;
 
 	if (value > SEM_VALUE_MAX)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error) {
 		/* Anonymous creation reports any falloc() failure as ENOSPC. */
 		if (name == NULL)
 			error = ENOSPC;
 		return (error);
 	}
 
 	/*
 	 * Go ahead and copyout the file descriptor now.  This is a bit
 	 * premature, but it is a lot easier to handle errors as opposed
 	 * to later when we've possibly created a new semaphore, etc.
 	 */
 	error = ksem_create_copyout_semid(td, semidp, fd, compat32);
 	if (error) {
 		fdclose(fdp, fp, fd, td);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	if (name == NULL) {
 		/* Create an anonymous semaphore. */
 		ks = ksem_alloc(td->td_ucred, mode, value);
 		if (ks == NULL)
 			error = ENOSPC;
 		else
 			ks->ks_flags |= KS_ANONYMOUS;
 	} else {
 		path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
 		error = copyinstr(name, path, MAXPATHLEN, NULL);
 
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[0] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(fdp, fp, fd, td);
 			fdrop(fp, td);
 			free(path, M_KSEM);
 			return (error);
 		}
 
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&ksem_dict_lock);
 		ks = ksem_lookup(path, fnv);
 		if (ks == NULL) {
 			/* Object does not exist, create it if requested. */
 			if (flags & O_CREAT) {
 				ks = ksem_alloc(td->td_ucred, mode, value);
 				if (ks == NULL)
 					error = ENFILE;
 				else {
 					/*
 					 * The dictionary now owns 'path';
 					 * clear it so it is not freed below.
 					 */
 					ksem_insert(path, fnv, ks);
 					path = NULL;
 				}
 			} else
 				error = ENOENT;
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixsem_check_open(td->td_ucred,
 				    ks);
 				if (error == 0)
 #endif
 				error = ksem_access(ks, td->td_ucred);
 			}
 			if (error == 0)
 				ksem_hold(ks);
 #ifdef INVARIANTS
 			/* Keeps the KASSERT in the error path below true. */
 			else
 				ks = NULL;
 #endif
 		}
 		sx_xunlock(&ksem_dict_lock);
 		if (path)
 			free(path, M_KSEM);
 	}
 
 	if (error) {
 		KASSERT(ks == NULL, ("ksem_create error with a ksem"));
 		fdclose(fdp, fp, fd, td);
 		fdrop(fp, td);
 		return (error);
 	}
 	KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
 
 	/* Attach the semaphore to the new file as a DTYPE_SEM object. */
 	finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
 
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /*
  * Translate a semaphore id into its file.  On success the file is
  * stored in *fpp (still carrying the reference obtained by fget(), to
  * be released by the caller with fdrop()) and 0 is returned.  Any
  * failure -- fget() error, a descriptor that is not DTYPE_SEM, or a
  * semaphore flagged KS_DEAD -- yields EINVAL.
  */
 static int
 ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp)
 {
 	struct ksem *ks;
 	struct file *fp;
 
 	if (fget(td, id, rightsp, &fp) != 0)
 		return (EINVAL);
 
 	if (fp->f_type == DTYPE_SEM) {
 		ks = fp->f_data;
 		if ((ks->ks_flags & KS_DEAD) == 0) {
 			*fpp = fp;
 			return (0);
 		}
 	}
 
 	/* Wrong descriptor type or dead semaphore. */
 	fdrop(fp, td);
 	return (EINVAL);
 }
 
 /* System calls. */
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_init_args {
 	unsigned int	value;
 	semid_t		*idp;
 };
 #endif
 int
 sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_open_args {
 	char		*name;
 	int		oflag;
 	mode_t		mode;
 	unsigned int	value;
 	semid_t		*idp;	
 };
 #endif
 int
 sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
 {
 
 	DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_unlink_args {
 	char		*name;
 };
 #endif
 /*
  * Remove a named semaphore from the dictionary.  The name is copied in
  * to a temporary MAXPATHLEN buffer, hashed, and handed to ksem_remove()
  * with the dictionary lock held exclusively.
  */
 int
 sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char *path;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
 	if (error == 0) {
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&ksem_dict_lock);
 		error = ksem_remove(path, fnv, td->td_ucred);
 		sx_xunlock(&ksem_dict_lock);
 	}
 
 	/* The temporary buffer is released on every path. */
 	free(path, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_close_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	error = ksem_get(td, uap->id, 0, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (ks->ks_flags & KS_ANONYMOUS) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = kern_close(td, uap->id);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_post_args {
 	semid_t	id;
 };
 #endif
 int
 sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	error = ksem_get(td, uap->id,
 	    cap_rights_init(&rights, CAP_SEM_POST), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_value == SEM_VALUE_MAX) {
 		error = EOVERFLOW;
 		goto err;
 	}
 	++ks->ks_value;
 	if (ks->ks_waiters > 0)
 		cv_signal(&ks->ks_cv);
 	error = 0;
 	vfs_timestamp(&ks->ks_ctime);
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_wait_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 0, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t		id;
 	const struct timespec *abstime;
 };
 #endif
 /*
  * Wait on a semaphore until the user-supplied absolute time.  The
  * timespec is copied in and its tv_nsec field validated (EINVAL if
  * outside [0, 1000000000)) before calling kern_sem_wait().
  */
 int
 sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
 {
 	struct timespec abstime;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime, sizeof(abstime));
 	if (error != 0)
 		return (error);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_trywait_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 1, NULL));
 }
 
 /*
  * Common implementation of the wait, trywait and timedwait system
  * calls.
  *
  * id      - semaphore descriptor.
  * tryflag - when non-zero, fail with EAGAIN instead of sleeping if the
  *           value is zero.
  * abstime - absolute deadline, or NULL to sleep without a timeout.
  *
  * While the value is zero the thread sleeps interruptibly on ks_cv with
  * ks_waiters incremented for the duration.  On success the value is
  * decremented and 0 is returned; otherwise an errno value is returned
  * (ETIMEDOUT once the deadline has passed).
  */
 static int
 kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime)
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
 	error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	DP((">>> kern_sem_wait critical section entered! pid=%d\n",
 	    (int)td->td_proc->p_pid));
 #ifdef MAC
 	error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		DP(("kern_sem_wait mac failed\n"));
 		goto err;
 	}
 #endif
 	DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
 	vfs_timestamp(&ks->ks_atime);
 	/* Re-check the value after every wakeup. */
 	while (ks->ks_value == 0) {
 		ks->ks_waiters++;
 		if (tryflag != 0)
 			error = EAGAIN;
 		else if (abstime == NULL)
 			error = cv_wait_sig(&ks->ks_cv, &sem_lock);
 		else {
 			for (;;) {
 				/* Compute the time remaining until abstime. */
 				ts1 = *abstime;
 				getnanotime(&ts2);
 				timespecsub(&ts1, &ts2);
 				TIMESPEC_TO_TIMEVAL(&tv, &ts1);
 				if (tv.tv_sec < 0) {
 					error = ETIMEDOUT;
 					break;
 				}
 				error = cv_timedwait_sig(&ks->ks_cv,
 				    &sem_lock, tvtohz(&tv));
 				/*
 				 * On EWOULDBLOCK loop again and re-evaluate
 				 * the remaining time; anything else ends
 				 * the timed wait.
 				 */
 				if (error != EWOULDBLOCK)
 					break;
 			}
 		}
 		ks->ks_waiters--;
 		if (error)
 			goto err;
 	}
 	ks->ks_value--;
 	DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
 	error = 0;
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
 	    (int)td->td_proc->p_pid, error));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_getvalue_args {
 	semid_t		id;
 	int		*val;
 };
 #endif
 int
 sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error, val;
 
 	error = ksem_get(td, uap->id,
 	    cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		mtx_unlock(&sem_lock);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 	val = ks->ks_value;
 	vfs_timestamp(&ks->ks_atime);
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	error = copyout(&val, uap->val, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_destroy_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
 {
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	error = ksem_get(td, uap->id, 0, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (!(ks->ks_flags & KS_ANONYMOUS)) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	mtx_lock(&sem_lock);
 	if (ks->ks_waiters != 0) {
 		mtx_unlock(&sem_lock);
 		error = EBUSY;
 		goto err;
 	}
 	ks->ks_flags |= KS_DEAD;
 	mtx_unlock(&sem_lock);
 
 	error = kern_close(td, uap->id);
 err:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Native system call table for the semaphore module; registered by
  * ksem_module_init() and unregistered by ksem_module_destroy().
  */
 static struct syscall_helper_data ksem_syscalls[] = {
 	SYSCALL_INIT_HELPER(ksem_init),
 	SYSCALL_INIT_HELPER(ksem_open),
 	SYSCALL_INIT_HELPER(ksem_unlink),
 	SYSCALL_INIT_HELPER(ksem_close),
 	SYSCALL_INIT_HELPER(ksem_post),
 	SYSCALL_INIT_HELPER(ksem_wait),
 	SYSCALL_INIT_HELPER(ksem_timedwait),
 	SYSCALL_INIT_HELPER(ksem_trywait),
 	SYSCALL_INIT_HELPER(ksem_getvalue),
 	SYSCALL_INIT_HELPER(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 int
 freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 1));
 }
 
 int
 freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
 {
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 1));
 }
 
 /*
  * 32-bit compat entry point for ksem_timedwait.  The 32-bit timespec
  * is copied in and converted field by field to a native timespec, then
  * validated exactly as the native system call does.
  */
 int
 freebsd32_ksem_timedwait(struct thread *td,
     struct freebsd32_ksem_timedwait_args *uap)
 {
 	struct timespec32 abstime32;
 	struct timespec abstime;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
 	if (error != 0)
 		return (error);
 	CP(abstime32, abstime, tv_sec);
 	CP(abstime32, abstime, tv_nsec);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 /*
  * 32-bit compat system call table.  Only init, open and timedwait have
  * dedicated freebsd32_* implementations in this file; the remaining
  * entries use SYSCALL32_INIT_HELPER_COMPAT with the native names.
  */
 static struct syscall_helper_data ksem32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Module set-up: initialise the locks and the path dictionary, publish
  * the P1003.1b configuration values, and register the native (and, if
  * configured, 32-bit compat) system calls.  Returns the first
  * registration error, or 0.
  */
 static int
 ksem_module_init(void)
 {
 	int error;
 
 	mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
 	mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
 	sx_init(&ksem_dict_lock, "ksem dictionary");
 	ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
 	p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 	p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 
 	error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	/* Register the compat table only if the native one succeeded. */
 	if (error == 0)
 		error = syscall32_helper_register(ksem32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * Module tear-down: unregister the system calls first, then release
  * the dictionary hash table and the locks set up by ksem_module_init()
  * and withdraw the P1003.1b configuration values.  Also called from
  * sem_modload() when ksem_module_init() fails part-way.
  */
 static void
 ksem_module_destroy(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(ksem32_syscalls);
 #endif
 	syscall_helper_unregister(ksem_syscalls);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
 	hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
 	sx_destroy(&ksem_dict_lock);
 	mtx_destroy(&ksem_count_lock);
 	mtx_destroy(&sem_lock);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD     - run ksem_module_init(); undo it on failure.
  * MOD_UNLOAD   - refuse with EOPNOTSUPP while any semaphore exists,
  *                otherwise set ksem_dead and tear the module down.
  * MOD_SHUTDOWN - nothing to do.
  * anything else yields EINVAL.
  */
 static int
 sem_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = ksem_module_init();
 		if (error != 0)
 			ksem_module_destroy();
 		return (error);
 
 	case MOD_UNLOAD:
 		mtx_lock(&ksem_count_lock);
 		if (nsems != 0) {
 			mtx_unlock(&ksem_count_lock);
 			return (EOPNOTSUPP);
 		}
 		ksem_dead = 1;
 		mtx_unlock(&ksem_count_lock);
 		ksem_module_destroy();
 		return (0);
 
 	case MOD_SHUTDOWN:
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /* Module descriptor: name, event handler, and (unused) private data. */
 static moduledata_t sem_mod = {
         "sem",
         &sem_modload,
         NULL
 };
 
 /* Declare the "sem" module at SI_SUB_SYSV_SEM and give it version 1. */
 DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sem, 1);
Index: projects/ino64/sys/kern/uipc_shm.c
===================================================================
--- projects/ino64/sys/kern/uipc_shm.c	(revision 276110)
+++ projects/ino64/sys/kern/uipc_shm.c	(revision 276111)
@@ -1,1048 +1,1050 @@
 /*-
  * Copyright (c) 2006, 2011 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for shared swap-backed anonymous memory objects via
  * shm_open(2) and shm_unlink(2).  While most of the implementation is
  * here, vm_mmap.c contains mapping logic changes.
  *
  * TODO:
  *
  * (1) Need to export data to a userland tool via a sysctl.  Should ipcs(1)
  *     and ipcrm(1) be expanded or should new tools to manage both POSIX
  *     kernel semaphores and POSIX shared memory be written?
  *
  * (2) Add support for this file type to fstat(1).
  *
  * (3) Resource limits?  Does this need its own resource limits or are the
  *     existing limits in mmap(2) sufficient?
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/uio.h>
 #include <sys/signal.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 /*
  * One entry in the path dictionary: associates a path (and its FNV
  * hash) with a shared memory object.
  */
 struct shm_mapping {
 	char		*sm_path;	/* path; freed by shm_remove() */
 	Fnv32_t		sm_fnv;		/* FNV hash of sm_path */
 	struct shmfd	*sm_shmfd;	/* referenced object (via shm_hold()) */
 	LIST_ENTRY(shm_mapping) sm_link;	/* hash bucket linkage */
 };
 
 /* File-scope state; all of it is initialised in shm_init(). */
 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
 /* Hash table of shm_mapping buckets, indexed through SHM_HASH(). */
 static LIST_HEAD(, shm_mapping) *shm_dictionary;
 /* NOTE(review): presumably serialises dictionary access -- confirm. */
 static struct sx shm_dict_lock;
 /*
  * Held while reading or updating an object's timestamps; shm_stat()
  * and shm_access() also read the mode, uid and gid under it.
  */
 static struct mtx shm_timestamp_lock;
 /* Bucket mask filled in by hashinit(). */
 static u_long shm_hash;
 /* Unit number allocator used for the fake inode numbers (shm_ino). */
 static struct unrhdr *shm_ino_unr;
 /* Device number reported as st_dev by shm_stat(). */
 static dev_t shm_dev_ino;
 
 /* Select the dictionary bucket for a given FNV hash. */
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 /* Internal helpers. */
 static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
 static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
 static void	shm_init(void *arg);
 static void	shm_drop(struct shmfd *shmfd);
 static struct shmfd *shm_hold(struct shmfd *shmfd);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
 static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	shm_dotruncate(struct shmfd *shmfd, off_t length);
 
 /* fileops method implementations referenced from shm_ops. */
 static fo_rdwr_t	shm_read;
 static fo_rdwr_t	shm_write;
 static fo_truncate_t	shm_truncate;
 static fo_stat_t	shm_stat;
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
 static fo_seek_t	shm_seek;
 static fo_fill_kinfo_t	shm_fill_kinfo;
 
 /*
  * File descriptor operations.  ioctl, poll and kqfilter are wired to
  * the invfo_* stubs; sendfile uses the generic vn_sendfile.
  */
 static struct fileops shm_ops = {
 	.fo_read = shm_read,
 	.fo_write = shm_write,
 	.fo_truncate = shm_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = shm_stat,
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = shm_seek,
 	.fo_fill_kinfo = shm_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 /*
  * Transfer at most one page's worth of data between 'uio' and the VM
  * object 'obj' at the position given by uio->uio_offset.
  *
  * The transfer length is limited to 'len' and to the remainder of the
  * page containing uio_offset.  The page is grabbed; if it is not fully
  * valid it is read in through the pager when the pager has it, or its
  * invalid parts are zeroed otherwise.  The page is held across the
  * copy and marked dirty after a successful UIO_WRITE.
  *
  * Returns 0 on success, EIO if the page could not be brought in, or the
  * error from uiomove_fromphys().
  */
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 {
 	vm_page_t m;
 	vm_pindex_t idx;
 	size_t tlen;
 	int error, offset, rv;
 
 	idx = OFF_TO_IDX(uio->uio_offset);
 	offset = uio->uio_offset & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	VM_OBJECT_WLOCK(obj);
 
 	/*
 	 * Parallel reads of the page content from disk are prevented
 	 * by exclusive busy.
 	 *
 	 * Although the tmpfs vnode lock is held here, it is
 	 * nonetheless safe to sleep waiting for a free page.  The
 	 * pageout daemon does not need to acquire the tmpfs vnode
 	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
 	 * type object.
 	 *
 	 * NOTE(review): the tmpfs/"tobj" wording above does not match
 	 * any name in this function; it looks inherited from another
 	 * caller's context -- confirm and reword.
 	 */
 	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
 			rv = vm_pager_get_pages(obj, &m, 1, 0);
 			/* Look the page up again after the pager call. */
 			m = vm_page_lookup(obj, idx);
 			if (m == NULL) {
 				printf(
 		    "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n",
 				    obj, idx, rv);
 				VM_OBJECT_WUNLOCK(obj);
 				return (EIO);
 			}
 			if (rv != VM_PAGER_OK) {
 				printf(
 	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
 				    obj, idx, m->valid, rv);
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				VM_OBJECT_WUNLOCK(obj);
 				return (EIO);
 			}
 		} else
 			vm_page_zero_invalid(m, TRUE);
 	}
 	vm_page_xunbusy(m);
 	vm_page_lock(m);
 	/* Hold the page so it stays around while the object is unlocked. */
 	vm_page_hold(m);
 	if (m->queue == PQ_NONE) {
 		vm_page_deactivate(m);
 	} else {
 		/* Requeue to maintain LRU ordering. */
 		vm_page_requeue(m);
 	}
 	vm_page_unlock(m);
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
 	if (uio->uio_rw == UIO_WRITE && error == 0) {
 		VM_OBJECT_WLOCK(obj);
 		vm_page_dirty(m);
 		vm_pager_page_unswapped(m);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 
 	return (error);
 }
 
 /*
  * Move data between 'uio' and the VM object 'obj', one page at a time,
  * without going past 'obj_size'.  The loop stops when the uio is
  * exhausted, the offset reaches obj_size, a page transfer fails, or a
  * page transfer makes no progress.  Returns 0 or the first error from
  * uiomove_object_page().
  */
 int
 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
 {
 	ssize_t resid;
 	size_t len;
 	int error;
 
 	error = 0;
 	for (;;) {
 		resid = uio->uio_resid;
 		if (resid <= 0 || uio->uio_offset >= obj_size)
 			break;
 
 		/* Clamp this step to what is left of the object. */
 		len = MIN(obj_size - uio->uio_offset, resid);
 		if (len == 0)
 			break;
 
 		error = uiomove_object_page(obj, len, uio);
 		if (error != 0 || uio->uio_resid == resid)
 			break;
 	}
 	return (error);
 }
 
 /*
  * fo_seek handler.  Computes the new offset relative to the start
  * (L_SET), the current file offset (L_INCR) or the object size
  * (L_XTND).  Arithmetic that would exceed OFF_MAX yields EOVERFLOW; an
  * unknown 'whence' or a result outside [0, shm_size] yields EINVAL.
  * On success the offset is stored in td->td_uretoff.tdu_off and in the
  * file; on failure the file offset is left untouched.
  */
 static int
 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct shmfd *shmfd;
 	off_t foffset;
 	int error;
 
 	shmfd = fp->f_data;
 	foffset = foffset_lock(fp, 0);
 	error = 0;
 
 	switch (whence) {
 	case L_SET:
 		break;
 	case L_INCR:
 		if (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))
 			error = EOVERFLOW;
 		else
 			offset += foffset;
 		break;
 	case L_XTND:
 		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset)
 			error = EOVERFLOW;
 		else
 			offset += shmfd->shm_size;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	if (error == 0) {
 		if (offset >= 0 && offset <= shmfd->shm_size)
 			td->td_uretoff.tdu_off = offset;
 		else
 			error = EINVAL;
 	}
 
 	/* FOF_NOUPDATE keeps the stored offset unchanged on failure. */
 	foffset_unlock(fp, offset, error == 0 ? 0 : FOF_NOUPDATE);
 	return (error);
 }
 
 /*
  * fo_read handler for POSIX shared memory descriptors.
  *
  * Copies data from the backing VM object into the caller's uio while
  * holding the file offset lock and a read range lock covering
  * [uio_offset, uio_offset + uio_resid).
  *
  * The MAC check is performed before either lock is taken, so a denied
  * request returns without holding the file offset lock or the range
  * lock.  This is the same ordering shm_write() uses.
  *
  * Returns 0 on success, the MAC error if the policy denies the read,
  * or the error reported by uiomove_object().
  */
 static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	/* Must precede the locks: this path returns without unlocking. */
 	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
 	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * fo_write handler.  After the MAC check and taking the file offset
  * lock, a write range lock is acquired: over just the target range
  * when FOF_OFFSET is set in 'flags', over [0, OFF_MAX) otherwise.  The
  * data is then copied into the backing object, bounded by the current
  * object size.
  */
 static int
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 
 	if (flags & FOF_OFFSET)
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
 		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	else
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * fo_truncate handler: after the optional MAC check, resize the object
  * through shm_dotruncate() and return its result.
  */
 static int
 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
 	if (error != 0)
 		return (error);
 #endif
 
 	return (shm_dotruncate(shmfd, length));
 }
 
 /*
  * fo_stat handler.  Fills '*sb' with values derived from the shared
  * memory object: size and block counts based on PAGE_SIZE, the
  * timestamps, mode and ownership (read under shm_timestamp_lock), and
  * the fake device and inode numbers.
  */
 static int
 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * Attempt to return sanish values for fstat() on a memory file
 	 * descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 	sb->st_dev = shm_dev_ino;
 	sb->st_ino = shmfd->shm_ino;
 	sb->st_blksize = PAGE_SIZE;
 	sb->st_size = shmfd->shm_size;
 	sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
 
 	mtx_lock(&shm_timestamp_lock);
 	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
 	sb->st_uid = shmfd->shm_uid;
 	sb->st_gid = shmfd->shm_gid;
 	sb->st_birthtim = shmfd->shm_birthtime;
 	sb->st_atim = shmfd->shm_atime;
 	sb->st_mtim = shmfd->shm_mtime;
 	sb->st_ctim = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 
 	return (0);
 }
 
 /*
  * fo_close handler: detach the object from the file and drop the
  * file's reference on it.  Always returns 0.
  */
 static int
 shm_close(struct file *fp, struct thread *td)
 {
 	struct shmfd *shmfd = fp->f_data;
 
 	fp->f_data = NULL;
 	shm_drop(shmfd);
 	return (0);
 }
 
 /*
  * Resize a shared memory object to 'length' bytes.
  *
  * Shrinking: refused with EBUSY while the object has kernel mappings.
  * The tail of the new last page is zeroed (paging it in if needed, EIO
  * on pager failure), pages beyond the new size are removed from memory
  * and swap, and the swap charge is released.
  *
  * Growing: swap for the additional pages is reserved against the
  * object's credential; ENOMEM if the reservation fails.
  *
  * On success the size, ctime and mtime are updated and 0 is returned.
  */
 static int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	vm_object_t object;
 	vm_page_t m, ma[1];
 	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
 	int base, rv;
 
 	object = shmfd->shm_object;
 	VM_OBJECT_WLOCK(object);
 	if (length == shmfd->shm_size) {
 		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	/* New object size in pages, rounded up. */
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
 
 	/* Are we shrinking?  If so, trim the end. */
 	if (length < shmfd->shm_size) {
 		/*
 		 * Disallow any requests to shrink the size if this
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0) {
 			VM_OBJECT_WUNLOCK(object);
 			return (EBUSY);
 		}
 
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = length & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(length);
 retry:
 			m = vm_page_lookup(object, idx);
 			if (m != NULL) {
 				/* Resident but busy: wait and look again. */
 				if (vm_page_sleep_if_busy(m, "shmtrc"))
 					goto retry;
 			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 				/* Not resident but the pager has it: page in. */
 				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
 				if (m == NULL) {
 					VM_OBJECT_WUNLOCK(object);
 					VM_WAIT;
 					VM_OBJECT_WLOCK(object);
 					goto retry;
 				} else if (m->valid != VM_PAGE_BITS_ALL) {
 					ma[0] = m;
 					rv = vm_pager_get_pages(object, ma, 1,
 					    0);
 					m = vm_page_lookup(object, idx);
 				} else
 					/* A cached page was reactivated. */
 					rv = VM_PAGER_OK;
 				vm_page_lock(m);
 				if (rv == VM_PAGER_OK) {
 					vm_page_deactivate(m);
 					vm_page_unlock(m);
 					vm_page_xunbusy(m);
 				} else {
 					vm_page_free(m);
 					vm_page_unlock(m);
 					VM_OBJECT_WUNLOCK(object);
 					return (EIO);
 				}
 			}
 			/* m stays NULL when the page exists nowhere. */
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
 				    ("shm_dotruncate: page %p is invalid", m));
 				vm_page_dirty(m);
 				vm_pager_page_unswapped(m);
 			}
 		}
 		/* Bytes being given back, in whole pages. */
 		delta = ptoa(object->size - nobjsize);
 
 		/* Toss in memory pages. */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 
 		/* Toss pages from swap. */
 		if (object->type == OBJT_SWAP)
 			swap_pager_freespace(object, nobjsize, delta);
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
 		/* Attempt to reserve the swap */
 		delta = ptoa(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (ENOMEM);
 		}
 		object->charge += delta;
 	}
 	shmfd->shm_size = length;
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_ctime);
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * shmfd object management including creation and reference counting
  * routines.
  *
  * shm_alloc() returns a new, zero-sized shared memory object owned by
  * the uid/gid in 'ucred' with the given mode.  The object starts with
  * one reference, all four timestamps set to the creation time, and a
  * fake inode number (0 if the allocator has none left).
  */
 static struct shmfd *
 shm_alloc(struct ucred *ucred, mode_t mode)
 {
 	struct shmfd *shmfd;
 	int ino;
 
 	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
 	shmfd->shm_size = 0;
 	shmfd->shm_uid = ucred->cr_uid;
 	shmfd->shm_gid = ucred->cr_gid;
 	shmfd->shm_mode = mode;
 
 	/* Backing VM object. */
 	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
 	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
 	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
 	VM_OBJECT_WLOCK(shmfd->shm_object);
 	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
 	vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
 	VM_OBJECT_WUNLOCK(shmfd->shm_object);
 
 	/* Every timestamp starts out equal to the birth time. */
 	vfs_timestamp(&shmfd->shm_birthtime);
 	shmfd->shm_ctime = shmfd->shm_birthtime;
 	shmfd->shm_mtime = shmfd->shm_birthtime;
 	shmfd->shm_atime = shmfd->shm_birthtime;
 
 	/* alloc_unr() returns -1 on exhaustion; record that as inode 0. */
 	ino = alloc_unr(shm_ino_unr);
 	shmfd->shm_ino = (ino == -1) ? 0 : ino;
 
 	refcount_init(&shmfd->shm_refs, 1);
 	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
 	rangelock_init(&shmfd->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shmfd);
 	mac_posixshm_create(ucred, shmfd);
 #endif
 
 	return (shmfd);
 }
 
 /*
  * Take an additional reference on 'shmfd' and return the same pointer
  * so the call can be used inline in an assignment.
  */
 static struct shmfd *
 shm_hold(struct shmfd *shmfd)
 {
 	struct shmfd *held = shmfd;
 
 	refcount_acquire(&held->shm_refs);
 	return (held);
 }
 
 /*
  * Release one reference on 'shmfd'.  If refcount_release() reports
  * that this was the final reference, destroy the range lock and mutex,
  * release the VM object, return the fake inode number (if one was
  * assigned) and free the structure.
  */
 static void
 shm_drop(struct shmfd *shmfd)
 {
 
 	/* Other references remain; nothing to tear down. */
 	if (!refcount_release(&shmfd->shm_refs))
 		return;
 
 #ifdef MAC
 	mac_posixshm_destroy(shmfd);
 #endif
 	rangelock_destroy(&shmfd->shm_rl);
 	mtx_destroy(&shmfd->shm_mtx);
 	vm_object_deallocate(shmfd->shm_object);
 
 	/* Inode 0 means shm_alloc() could not allocate a number. */
 	if (shmfd->shm_ino != 0)
 		free_unr(shm_ino_unr, shmfd->shm_ino);
 	free(shmfd, M_SHMFD);
 }
 
 /*
  * Check whether 'ucred' may access the object in the way described by
  * 'flags' (any combination of FREAD and FWRITE).  The mode, uid and
  * gid are read with shm_timestamp_lock held.  Returns the result of
  * vaccess().
  */
 static int
 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
 {
 	accmode_t accmode;
 	int error;
 
 	/* Translate file flags into vaccess() access bits. */
 	accmode = ((flags & FREAD) ? VREAD : 0) |
 	    ((flags & FWRITE) ? VWRITE : 0);
 
 	mtx_lock(&shm_timestamp_lock);
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    accmode, ucred, NULL);
 	mtx_unlock(&shm_timestamp_lock);
 
 	return (error);
 }
 
 /*
  * Dictionary management.  Paths are mapped to shmfd objects through an
  * in-kernel hash table keyed by the FNV hash of the path.
  *
  * shm_init() runs once via SYSINIT and sets up the locks, the hash
  * table, the fake inode number allocator and the fake device number.
  */
 static void
 shm_init(void *arg)
 {
 
 	/* Locks. */
 	sx_init(&shm_dict_lock, "shm dictionary");
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 
 	/* Path dictionary. */
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 
 	/* Identity reported through fstat(). */
 	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
 	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
 	shm_dev_ino = devfs_alloc_cdp_inode();
 	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 }
 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 /*
  * Walk the bucket selected by 'fnv' and return the object whose stored
  * hash and path both match, or NULL if there is none.  No reference is
  * taken on the returned object.
  */
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
 {
 	struct shm_mapping *ent;
 
 	LIST_FOREACH(ent, SHM_HASH(fnv), sm_link) {
 		/* Compare the cheap hash first, then the full path. */
 		if (ent->sm_fnv == fnv && strcmp(ent->sm_path, path) == 0)
 			return (ent->sm_shmfd);
 	}
 	return (NULL);
 }
 
 /*
  * Add a path -> object mapping to the dictionary.  The mapping takes
  * its own reference on 'shmfd' and records 'path' both in the mapping
  * and in the object; the path buffer is later freed by shm_remove().
  */
 static void
 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 {
 	struct shm_mapping *ent;
 
 	ent = malloc(sizeof(*ent), M_SHMFD, M_WAITOK);
 	ent->sm_fnv = fnv;
 	ent->sm_path = path;
 	ent->sm_shmfd = shm_hold(shmfd);
 	shmfd->shm_path = path;
 
 	LIST_INSERT_HEAD(SHM_HASH(fnv), ent, sm_link);
 }
 
 /*
  * Remove the dictionary entry for path, provided ucred passes the MAC
  * unlink check (when MAC is configured) and has read/write access to
  * the object.  Returns 0 on success, ENOENT if the path is not in the
  * dictionary, or the error from the failed permission check.
  */
 static int
 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct shm_mapping *ent;
 	int rv;
 
 	/* Locate the entry; ent is NULL if the list is exhausted. */
 	LIST_FOREACH(ent, SHM_HASH(fnv), sm_link) {
 		if (ent->sm_fnv == fnv && strcmp(ent->sm_path, path) == 0)
 			break;
 	}
 	if (ent == NULL)
 		return (ENOENT);
 
 #ifdef MAC
 	rv = mac_posixshm_check_unlink(ucred, ent->sm_shmfd);
 	if (rv)
 		return (rv);
 #endif
 	rv = shm_access(ent->sm_shmfd, ucred, FREAD | FWRITE);
 	if (rv)
 		return (rv);
 
 	/* Detach the name from the object, then drop the entry's reference. */
 	ent->sm_shmfd->shm_path = NULL;
 	LIST_REMOVE(ent, sm_link);
 	shm_drop(ent->sm_shmfd);
 	free(ent->sm_path, M_SHMFD);
 	free(ent, M_SHMFD);
 	return (0);
 }
 
 /* System calls. */
 /*
  * shm_open(2): open, and optionally create, a POSIX shared memory
  * object and return a file descriptor for it in td->td_retval[0].
  *
  * uap->path is either SHM_ANON (create an unnamed object) or a user
  * pointer to a path that must begin with '/'.  uap->flags must be
  * O_RDONLY or O_RDWR, optionally combined with O_CREAT, O_EXCL,
  * O_TRUNC and O_CLOEXEC.  uap->mode is masked by the process umask and
  * ACCESSPERMS.
  *
  * Returns 0 on success or an errno value; on failure the descriptor
  * allocated by falloc() is closed again.
  */
 int
 sys_shm_open(struct thread *td, struct shm_open_args *uap)
 {
 	struct filedesc *fdp;
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int fd, error;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * shm_open(2) is only allowed for anonymous objects.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
 		return (ECAPMODE);
 #endif
 
 	if ((uap->flags & O_ACCMODE) != O_RDONLY &&
 	    (uap->flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
 	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
 
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	/* A SHM_ANON path pointer creates an anonymous object. */
 	if (uap->path == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((uap->flags & O_ACCMODE) == O_RDONLY) {
 			fdclose(fdp, fp, fd, td);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		shmfd = shm_alloc(td->td_ucred, cmode);
 	} else {
 		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
 		error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
 #ifdef KTRACE
 		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
 			ktrnamei(path);
 #endif
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[0] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(fdp, fp, fd, td);
 			fdrop(fp, td);
 			free(path, M_SHMFD);
 			return (error);
 		}
 
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
 			if (uap->flags & O_CREAT) {
 #ifdef MAC
 				error = mac_posixshm_check_create(td->td_ucred,
 				    path);
 				if (error == 0) {
 #endif
 					/*
 					 * shm_insert() stores the path
 					 * pointer in the dictionary, so the
 					 * buffer must not be freed here.
 					 */
 					shmfd = shm_alloc(td->td_ucred, cmode);
 					shm_insert(path, fnv, shmfd);
 #ifdef MAC
 				} else {
 					/*
 					 * Creation was denied, so the path
 					 * was never handed to the dictionary;
 					 * free it to avoid leaking it.
 					 */
 					free(path, M_SHMFD);
 				}
 #endif
 			} else {
 				free(path, M_SHMFD);
 				error = ENOENT;
 			}
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			free(path, M_SHMFD);
 			if ((uap->flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixshm_check_open(td->td_ucred,
 				    shmfd, FFLAGS(uap->flags & O_ACCMODE));
 				if (error == 0)
 #endif
 				error = shm_access(shmfd, td->td_ucred,
 				    FFLAGS(uap->flags & O_ACCMODE));
 			}
 
 			/*
 			 * Truncate the file back to zero length if
 			 * O_TRUNC was specified and the object was
 			 * opened with read/write.
 			 */
 			if (error == 0 &&
 			    (uap->flags & (O_ACCMODE | O_TRUNC)) ==
 			    (O_RDWR | O_TRUNC)) {
 #ifdef MAC
 				error = mac_posixshm_check_truncate(
 					td->td_ucred, fp->f_cred, shmfd);
 				if (error == 0)
 #endif
 					shm_dotruncate(shmfd, 0);
 			}
 			/* This reference is the one handed to finit() below. */
 			if (error == 0)
 				shm_hold(shmfd);
 		}
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
 			fdclose(fdp, fp, fd, td);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /*
  * shm_unlink(2): copy the path in from userspace and remove its entry
  * from the shared memory dictionary.  Returns 0 on success, the
  * copyinstr() error, or the error from shm_remove().
  */
 int
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *kpath;
 	Fnv32_t hash;
 	int rv;
 
 	kpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	rv = copyinstr(uap->path, kpath, MAXPATHLEN, NULL);
 	if (rv == 0) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_NAMEI))
 			ktrnamei(kpath);
 #endif
 		hash = fnv_32_str(kpath, FNV1_32_INIT);
 		/* The dictionary is modified under the exclusive lock. */
 		sx_xlock(&shm_dict_lock);
 		rv = shm_remove(kpath, hash, td->td_ucred);
 		sx_xunlock(&shm_dict_lock);
 	}
 	/* The temporary copy of the path is released on every path. */
 	free(kpath, M_TEMP);
 
 	return (rv);
 }
 
 /*
  * Helper for mmap(): check the requested range against the current
  * size of the shm object and, if acceptable, hand back a referenced
  * vm_object in *obj.  Returns EINVAL if the range is rejected,
  * otherwise 0.  The object's access time is updated on success.
  */
 int
 shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
     vm_object_t *obj)
 {
 
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (foff >= shmfd->shm_size)
 		return (EINVAL);
 	if (foff + objsize > round_page(shmfd->shm_size))
 		return (EINVAL);
 
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_atime);
 	mtx_unlock(&shm_timestamp_lock);
 
 	/* The caller receives its own reference on the backing object. */
 	vm_object_reference(shmfd->shm_object);
 	*obj = shmfd->shm_object;
 	return (0);
 }
 
 /*
  * fo_chmod method for shm descriptors.  After the MAC check (when
  * configured) and a VADMIN access check against active_cred, replace
  * the object's mode with the ACCESSPERMS bits of mode.  Returns 0 or
  * the error from the failed check.
  */
 static int
 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 	/*
 	 * SUSv4 says that x bits of permission need not be affected.
 	 * Be consistent with our shm_open there.
 	 */
 #ifdef MAC
 	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 	if (error == 0)
 #endif
 		error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
 		    shmfd->shm_gid, VADMIN, active_cred, NULL);
 	if (error == 0)
 		shmfd->shm_mode = mode & ACCESSPERMS;
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * fo_chown method for shm descriptors.  A uid or gid of -1 means
  * "leave unchanged".  Changing the owner to someone other than the
  * caller, or the group to one the caller is not a member of, requires
  * PRIV_VFS_CHOWN.  Returns 0 or the error from the failed check.
  */
 static int
 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error, needpriv;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	/* Substitute the current values for "unchanged" requests. */
 	if (uid == (uid_t)-1)
 		uid = shmfd->shm_uid;
 	if (gid == (gid_t)-1)
 		gid = shmfd->shm_gid;
 
 	needpriv = (uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred));
 	if (needpriv) {
 		error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0);
 		if (error)
 			goto out;
 	}
 	shmfd->shm_uid = uid;
 	shmfd->shm_gid = gid;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Helper routines to allow the backing object of a shared memory file
  * descriptor to be mapped in the kernel.
  */
 int
 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 {
 	struct shmfd *shmfd;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	int rv;
 
 	/* Only shared memory descriptors can be mapped this way. */
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	/*
 	 * Count the kernel mapping and take an object reference for it
 	 * while the object lock is still held.
 	 */
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
 	/* Page-align the request; ofs is the offset within the first page. */
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
 	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
 		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 		if (rv == KERN_SUCCESS) {
 			/* Return the address of the requested byte, not the page. */
 			*memp = (void *)(kva + ofs);
 			return (0);
 		}
 		/*
 		 * Wiring failed: remove the mapping.  No explicit
 		 * vm_object_deallocate() is done on this path.
 		 */
 		vm_map_remove(kernel_map, kva, kva + size);
 	} else
 		/* No mapping was created: drop the reference taken above. */
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
 	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * We require the caller to unmap the entire entry.  This allows us to
  * safely decrement shm_kmappings when a mapping is removed.
  */
 int
 shm_unmap(struct file *fp, void *mem, size_t size)
 {
 	struct shmfd *shmfd;
 	vm_map_entry_t entry;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	vm_map_t map;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	/* Recompute the page-aligned range the same way shm_map() did. */
 	kva = (vm_offset_t)mem;
 	ofs = kva & PAGE_MASK;
 	kva = trunc_page(kva);
 	size = round_page(size + ofs);
 	map = kernel_map;
 	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 	    &obj, &pindex, &prot, &wired);
 	if (rv != KERN_SUCCESS)
 		return (EINVAL);
 	/* The request must cover exactly one whole map entry. */
 	if (entry->start != kva || entry->end != kva + size) {
 		vm_map_lookup_done(map, entry);
 		return (EINVAL);
 	}
 	vm_map_lookup_done(map, entry);
 	/* The entry must be backed by this descriptor's object. */
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
 	/* Undo the accounting done by shm_map(). */
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
 /*
  * Fill in a kinfo_file record for a shm descriptor: type, mode, size
  * and, if the object is still named, its path.  Always returns 0.
  */
 static int
 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct shmfd *shmfd;
 
 	kif->kf_type = KF_TYPE_SHM;
 	shmfd = fp->f_data;
 
 	/* shm_mode is read under shm_timestamp_lock, as in shm_access(). */
 	mtx_lock(&shm_timestamp_lock);
 	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;	/* XXX */
 	/*
 	 * Mirror the mode into the freebsd10 field.  NOTE(review): the
 	 * "truncate" remark suggests that field is narrower than
 	 * kf_file_mode; confirm against the struct kinfo_file definition.
 	 */
+	kif->kf_un.kf_file.kf_file_mode_freebsd10 =
+	    kif->kf_un.kf_file.kf_file_mode; /* truncate */
 	mtx_unlock(&shm_timestamp_lock);
 	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 	/*
 	 * Unlocked peek first, then re-check under the dictionary lock:
 	 * shm_remove() clears shm_path and frees the path buffer.
 	 */
 	if (shmfd->shm_path != NULL) {
 		sx_slock(&shm_dict_lock);
 		if (shmfd->shm_path != NULL)
 			strlcpy(kif->kf_path, shmfd->shm_path,
 			    sizeof(kif->kf_path));
 		sx_sunlock(&shm_dict_lock);
 	}
 	return (0);
 }
Index: projects/ino64/sys/kern/vfs_vnops.c
===================================================================
--- projects/ino64/sys/kern/vfs_vnops.c	(revision 276110)
+++ projects/ino64/sys/kern/vfs_vnops.c	(revision 276111)
@@ -1,2348 +1,2354 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/disk.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 
 /* Forward declarations of the file-local fileops methods. */
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_stat_t	vn_statfile;
 static fo_close_t	vn_closefile;
 
 /*
  * File operations vector for vnode-backed files.  Both fo_read and
  * fo_write point at vn_io_fault rather than at vn_read/vn_write
  * directly.
  */
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
 	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_chmod = vn_chmod,
 	.fo_chown = vn_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /* NOTE(review): io_hold_cnt is not used in the visible part of the file. */
 static const int io_hold_cnt = 16;
 /* Run-time switch consulted by do_vn_io_fault(); enabled by default. */
 static int vn_io_fault_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 /* Read-only counter exported as debug.vn_io_faults. */
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 
 /*
  * Returns true if vn_io_fault mode of handling the i/o request should
  * be used.
  */
 static bool
 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 {
 	struct mount *mp;
 
 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 	    (mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 }
 
 /*
  * Structure used to pass arguments to vn_io_fault1(), to do either
  * file- or vnode-based I/O calls.
  */
 struct vn_io_fault_args {
 	/* Selects which member of the args union below is in use. */
 	enum {
 		VN_IO_FAULT_FOP,
 		VN_IO_FAULT_VOP
 	} kind;
 	/* Credentials for the i/o. */
 	struct ucred *cred;
 	/* vn_rdwr() stores its ioflg here. */
 	int flags;
 	union {
 		/* File-based call: the file and the fileops rdwr routine. */
 		struct fop_args_tag {
 			struct file *fp;
 			fo_rdwr_t *doio;
 		} fop_args;
 		/* Vnode-based call: the vnode (set by vn_rdwr()). */
 		struct vop_args_tag {
 			struct vnode *vp;
 		} vop_args;
 	} args;
 };
 
 /* Defined later in the file; used by vn_rdwr() for VN_IO_FAULT_VOP. */
 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
     struct vn_io_fault_args *args, struct thread *td);
 
 /*
  * Convenience wrapper around vn_open_cred(): open with no extra
  * vn_open flags, using the credentials of the thread recorded in the
  * nameidata.
  */
 int
 vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
 	int *flagp, cmode;
 	struct file *fp;
 {
 	struct ucred *cred;
 
 	cred = ndp->ni_cnd.cn_thread->td_ucred;
 	return (vn_open_cred(ndp, flagp, cmode, 0, cred, fp));
 }
 
 /*
  * Common code for vnode open operations via a name lookup.
  * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  * 
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  */
 int
 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
     struct ucred *cred, struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct thread *td = ndp->ni_cnd.cn_thread;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
 
 restart:
 	/* Reload the caller's flags on every (re)start of the lookup. */
 	fmode = *flagp;
 	if (fmode & O_CREAT) {
 		/* Creating lookup: parent is locked, leaf is locked if found. */
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
 		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
 			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
 		bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			/* Target does not exist: create a regular file. */
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			/*
 			 * If write access cannot be started without
 			 * waiting, release the lookup state, wait, and
 			 * redo the lookup from scratch.
 			 */
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				goto restart;
 			}
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0)
 #endif
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 						   &ndp->ni_cnd, vap);
 			vput(ndp->ni_dvp);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			/* A freshly created file needs no truncation. */
 			fmode &= ~O_TRUNC;
 			vp = ndp->ni_vp;
 		} else {
 			/* Target exists: release the parent directory. */
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			/* Nothing was created; open it like an existing file. */
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		/* Plain lookup; a shared leaf lock suffices for non-writers. */
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = ISOPEN |
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
 		if (!(fmode & FWRITE))
 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
 		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
 			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
 		if (vn_open_flags & VN_OPEN_NOCAPCHECK)
 			ndp->ni_cnd.cn_flags |= NOCAPCHECK;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	error = vn_open_vnode(vp, fmode, cred, td, fp);
 	if (error)
 		goto bad;
 	/* Report the possibly adjusted flags back to the caller. */
 	*flagp = fmode;
 	return (0);
 bad:
 	/* Failure: free the path buffer and release the locked vnode. */
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
  */
 int
 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
     struct thread *td, struct file *fp)
 {
 	struct mount *mp;
 	accmode_t accmode;
 	struct flock lf;
 	int error, have_flock, lock_flags, type;
 
 	/* Reject vnode types that cannot be opened this way. */
 	if (vp->v_type == VLNK)
 		return (EMLINK);
 	if (vp->v_type == VSOCK)
 		return (EOPNOTSUPP);
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
 	/* Translate the open flags into the access mode to check. */
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR)
 			return (EISDIR);
 		accmode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		accmode |= VREAD;
 	if (fmode & FEXEC)
 		accmode |= VEXEC;
 	if ((fmode & O_APPEND) && (fmode & FWRITE))
 		accmode |= VAPPEND;
 #ifdef MAC
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error)
 		return (error);
 #endif
 	/*
 	 * The access checks are skipped when O_CREAT is still set;
 	 * vn_open_cred() clears O_CREAT when the file already existed.
 	 */
 	if ((fmode & O_CREAT) == 0) {
 		if (accmode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
 				return (error);
 		}
 		if (accmode) {
 		        error = VOP_ACCESS(vp, accmode, cred, td);
 			if (error)
 				return (error);
 		}
 	}
 	/* FIFOs are opened with the vnode lock held exclusively. */
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
 		return (error);
 
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		KASSERT(fp != NULL, ("open with flock requires fp"));
 		/*
 		 * Remember how the vnode is locked, then drop the lock
 		 * while the advisory lock is requested; it is reacquired
 		 * in the same mode afterwards.
 		 */
 		lock_flags = VOP_ISLOCKED(vp);
 		VOP_UNLOCK(vp, 0);
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (fmode & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 		have_flock = (error == 0);
 		vn_lock(vp, lock_flags | LK_RETRY);
 		/* The vnode may have been doomed while it was unlocked. */
 		if (error == 0 && vp->v_iflag & VI_DOOMED)
 			error = ENOENT;
 		/*
 		 * Another thread might have used this vnode as an
 		 * executable while the vnode lock was dropped.
 		 * Ensure the vnode is still able to be opened for
 		 * writing after the lock has been obtained.
 		 */
 		if (error == 0 && accmode & VWRITE)
 			error = vn_writechk(vp);
 		if (error) {
 			/* Undo the advisory lock (if taken) and the open. */
 			VOP_UNLOCK(vp, 0);
 			if (have_flock) {
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
 				    F_FLOCK);
 			}
 			vn_start_write(vp, &mp, V_WAIT);
 			vn_lock(vp, lock_flags | LK_RETRY);
 			(void)VOP_CLOSE(vp, fmode, cred, td);
 			vn_finished_write(mp);
 			/* Prevent second close from fdrop()->vn_close(). */
 			if (fp != NULL)
 				fp->f_ops= &badfileops;
 			return (error);
 		}
 		fp->f_flag |= FHASLOCK;
 	}
 	/* Account for the new writer; vn_close() undoes this. */
 	if (fmode & FWRITE) {
 		VOP_ADD_WRITECOUNT(vp, 1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (0);
 }
 
 /*
  * Check whether the (locked) vnode may be opened for writing.
  * Returns ETXTBSY when VOP_IS_TEXT() reports that the vnode is in use
  * as program text, otherwise 0.
  */
 int
 vn_writechk(vp)
 	register struct vnode *vp;
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	/* Text vnodes cannot be written. */
 	return (VOP_IS_TEXT(vp) ? ETXTBSY : 0);
 }
 
 /*
  * Close a vnode: lock it, drop the write count taken at open time if
  * the file was open for writing, call VOP_CLOSE() and release the
  * vnode with vput().  Returns the result of VOP_CLOSE().
  */
 int
 vn_close(vp, flags, file_cred, td)
 	register struct vnode *vp;
 	int flags;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct mount *mp;
 	int error, lock_flags;
 
 	/*
 	 * A shared lock is used only for non-FIFO vnodes that were not
 	 * open for writing, on mounts where MNT_EXTENDED_SHARED() holds.
 	 */
 	lock_flags = (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 	    MNT_EXTENDED_SHARED(vp->v_mount)) ? LK_SHARED : LK_EXCLUSIVE;
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	if ((flags & FWRITE) != 0) {
 		VNASSERT(vp->v_writecount > 0, vp,
 		    ("vn_close: negative writecount"));
 		VOP_ADD_WRITECOUNT(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	vput(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Heuristic to detect sequential operation.  Updates fp->f_seqcount
  * and returns it shifted into the IO_SEQSHIFT position of the ioflag
  * word, or 0 when the access is not considered sequential.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	int sequential;
 
 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 	/* With FRDAHEAD set, f_seqcount is used as is and left untouched. */
 	if (fp->f_flag & FRDAHEAD)
 		return (fp->f_seqcount << IO_SEQSHIFT);
 
 	/*
 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 	 * that the first I/O is normally considered to be slightly
 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
 	 * unless previous seeks have reduced f_seqcount to 0, in which
 	 * case offset 0 is not special.
 	 */
 	sequential = (uio->uio_offset == 0 && fp->f_seqcount > 0) ||
 	    uio->uio_offset == fp->f_nextoff;
 
 	if (!sequential) {
 		/* Not sequential.  Quickly draw-down sequentiality. */
 		fp->f_seqcount = (fp->f_seqcount > 1) ? 1 : 0;
 		return (0);
 	}
 
 	/*
 	 * f_seqcount is in units of fixed-size blocks so that it
 	 * depends mainly on the amount of sequential I/O and not
 	 * much on the number of sequential I/O's.  The fixed size
 	 * of 16384 is hard-coded here since it is (not quite) just
 	 * a magic size that works well here.  This size is more
 	 * closely related to the best I/O size for real disks than
 	 * to any block size used by software.
 	 */
 	fp->f_seqcount += howmany(uio->uio_resid, 16384);
 	if (fp->f_seqcount > IO_SEQMAX)
 		fp->f_seqcount = IO_SEQMAX;
 	return (fp->f_seqcount << IO_SEQSHIFT);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	/* Describe the single buffer as a one-element uio. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 
 	/*
 	 * Unless the caller already holds the vnode lock (IO_NODELOCKED),
 	 * take the range lock (unless IO_RANGELOCKED), start the write
 	 * if writing, and lock the vnode.
 	 */
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((ioflg & IO_RANGELOCKED) == 0) {
 			if (rw == UIO_READ) {
 				rl_cookie = vn_rangelock_rlock(vp, offset,
 				    offset + len);
 			} else {
 				rl_cookie = vn_rangelock_wlock(vp, offset,
 				    offset + len);
 			}
 		} else
 			rl_cookie = NULL;
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				goto out;
 			/* Writes take a shared lock where the mount allows it. */
 			if (MNT_SHARED_WRITES(mp) ||
 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 				lock_flags = LK_SHARED;
 			else
 				lock_flags = LK_EXCLUSIVE;
 		} else
 			lock_flags = LK_SHARED;
 		vn_lock(vp, lock_flags | LK_RETRY);
 	} else
 		rl_cookie = NULL;
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		/* The file credentials take precedence when supplied. */
 		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (do_vn_io_fault(vp, &auio)) {
 			args.kind = VN_IO_FAULT_VOP;
 			args.cred = cred;
 			args.flags = ioflg;
 			args.args.vop_args.vp = vp;
 			error = vn_io_fault1(vp, &auio, &args, td);
 		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 		}
 	}
 	/*
 	 * Report the residual count if the caller asked for it;
 	 * otherwise a short transfer without an error becomes EIO.
 	 */
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		VOP_UNLOCK(vp, 0);
 		if (mp != NULL)
 			vn_finished_write(mp);
 	}
  out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  * request is split up into smaller chunks and we try to avoid saturating
  * the buffer cache while potentially holding a vnode locked, so we 
  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  * to give other processes a chance to lock the vnode (either other processes
  * core'ing the same binary, or unrelated processes scanning the directory).
  */
 int
 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
     file_cred, aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
 	void *base;
 	size_t len;
 	off_t offset;
 	enum uio_seg segflg;
 	int ioflg;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	size_t *aresid;
 	struct thread *td;
 {
 	int error = 0;
 	ssize_t iaresid;
 
 	do {
 		int chunk;
 
 		/*
 		 * Force `offset' to a multiple of MAXBSIZE except possibly
 		 * for the first chunk, so that filesystems only need to
 		 * write full blocks except possibly for the first and last
 		 * chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 
 		/* Never transfer more than what is left. */
 		if (chunk > len)
 			chunk = len;
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		iaresid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &iaresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 		offset += chunk;
 		base = (char *)base + chunk;
 		kern_yield(PRI_USER);
 	} while (len);
 	/* Residual: chunks not attempted plus the last chunk's remainder. */
 	if (aresid)
 		*aresid = len + iaresid;
 	return (error);
 }
 
 /*
  * Return the current file offset of fp.  Unless FOF_NOLOCK is given,
  * also acquire the offset lock (FOFFSET_LOCKED), sleeping until any
  * current holder releases it.  FOF_OFFSET must not be set.
  */
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	struct mtx *mtxp;
 	off_t res;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 #if OFF_MAX <= LONG_MAX
 	/*
 	 * Caller only wants the current f_offset value.  Assume that
 	 * the long and shorter integer types reads are atomic.
 	 */
 	if ((flags & FOF_NOLOCK) != 0)
 		return (fp->f_offset);
 #endif
 
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOLOCK) == 0) {
 		/* Wait for the current holder; it wakes us on unlock. */
 		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 	}
 	res = fp->f_offset;
 	mtx_unlock(mtxp);
 	return (res);
 }
 
 /*
  * Counterpart of foffset_lock().  Stores val into f_offset unless
  * FOF_NOUPDATE is set, and into f_nextoff if FOF_NEXTOFF is set.
  * Unless FOF_NOLOCK is given, also releases the offset lock and wakes
  * any waiters.  FOF_OFFSET must not be set.
  */
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	struct mtx *mtxp;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 #if OFF_MAX <= LONG_MAX
 	/* Lockless variant: plain stores, without taking the pool mutex. */
 	if ((flags & FOF_NOLOCK) != 0) {
 		if ((flags & FOF_NOUPDATE) == 0)
 			fp->f_offset = val;
 		if ((flags & FOF_NEXTOFF) != 0)
 			fp->f_nextoff = val;
 		return;
 	}
 #endif
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOUPDATE) == 0)
 		fp->f_offset = val;
 	if ((flags & FOF_NEXTOFF) != 0)
 		fp->f_nextoff = val;
 	if ((flags & FOF_NOLOCK) == 0) {
 		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 		    ("Lost FOFFSET_LOCKED"));
 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 			wakeup(&fp->f_vnread_flags);
 		/* Clears both FOFFSET_LOCKED and FOFFSET_LOCK_WAITING. */
 		fp->f_vnread_flags = 0;
 	}
 	mtx_unlock(mtxp);
 }
 
 /*
  * Load the file offset into uio->uio_offset via foffset_lock().
  * Nothing is done when the caller supplied its own offset (FOF_OFFSET).
  */
 void
 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) != 0)
 		return;
 	uio->uio_offset = foffset_lock(fp, flags);
 }
 
 /*
  * Pass uio->uio_offset back to the file via foffset_unlock().
  * Nothing is done when the caller supplied its own offset (FOF_OFFSET).
  */
 void
 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) != 0)
 		return;
 	foffset_unlock(fp, uio->uio_offset, flags);
 }
 
 /*
  * Return the advice that applies to the i/o described by uio.  The
  * advice recorded on the file is returned only if the whole request
  * lies within [fa_start, fa_end]; otherwise, or when no advice is
  * recorded, POSIX_FADV_NORMAL is returned.
  */
 static int
 get_advice(struct file *fp, struct uio *uio)
 {
 	struct mtx *lk;
 	int advice;
 
 	if (fp->f_advice == NULL)
 		return (POSIX_FADV_NORMAL);
 
 	advice = POSIX_FADV_NORMAL;
 	lk = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(lk);
 	if (uio->uio_offset >= fp->f_advice->fa_start &&
 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 		advice = fp->f_advice->fa_advice;
 	mtx_unlock(lk);
 	return (advice);
 }
 
 /*
  * File table vnode read routine.
  */
 static int
 vn_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct mtx *mtxp;
 	int error, ioflag;
 	int advice;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	/* Derive the VOP_READ() i/o flags from the file flags. */
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
 	/* Remember where the read started for the NOREUSE handling below. */
 	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	/* Consulted by sequential_heuristic() on the next i/o. */
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    offset != uio->uio_offset) {
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
 		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
 		 * case of using POSIX_FADV_NOREUSE with sequential
 		 * access, track the previous implicit DONTNEED
 		 * request and grow this request to include the
 		 * current read(2) in addition to the previous
 		 * DONTNEED.  With purely sequential access this will
 		 * cause the DONTNEED requests to continuously grow to
 		 * cover all of the previously read regions of the
 		 * file.  This allows filesystem blocks that are
 		 * accessed by multiple calls to read(2) to be flushed
 		 * once the last read(2) finishes.
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
 			/* Merge with an adjacent previous range, if any. */
 			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
 				start = fp->f_advice->fa_prevstart;
 			else if (fp->f_advice->fa_prevstart != 0 &&
 			    fp->f_advice->fa_prevstart == end + 1)
 				end = fp->f_advice->fa_prevend;
 			fp->f_advice->fa_prevstart = start;
 			fp->f_advice->fa_prevend = end;
 		}
 		mtx_unlock(mtxp);
 		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
 	}
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  */
 static int
 vn_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct mtx *mtxp;
 	int error, ioflag, lock_flags;
 	int advice;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	mp = NULL;
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
 
 	advice = get_advice(fp, uio);
 
 	if (MNT_SHARED_WRITES(mp) ||
 	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
 		lock_flags = LK_SHARED;
 	} else {
 		lock_flags = LK_EXCLUSIVE;
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
 	}
 	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    offset != uio->uio_offset) {
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush clean pages and
 		 * buffers for the backing file after a
 		 * POSIX_FADV_NOREUSE write(2).  To optimize the
 		 * common case of using POSIX_FADV_NOREUSE with
 		 * sequential access, track the previous implicit
 		 * DONTNEED request and grow this request to include
 		 * the current write(2) in addition to the previous
 		 * DONTNEED.  With purely sequential access this will
 		 * cause the DONTNEED requests to continously grow to
 		 * cover all of the previously written regions of the
 		 * file.
 		 *
 		 * Note that the blocks just written are almost
 		 * certainly still dirty, so this only works when
 		 * VOP_ADVISE() calls from subsequent writes push out
 		 * the data written by this write(2) once the backing
 		 * buffers are clean.  However, as compared to forcing
 		 * IO_DIRECT, this gives much saner behavior.  Write
 		 * clustering is still allowed, and clean pages are
 		 * merely moved to the cache page queue rather than
 		 * outright thrown away.  This means a subsequent
 		 * read(2) can still avoid hitting the disk if the
 		 * pages have not been reclaimed.
 		 *
 		 * This does make POSIX_FADV_NOREUSE largely useless
 		 * with non-sequential access.  However, sequential
 		 * access is the more common use case and the flag is
 		 * merely advisory.
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
 			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
 				start = fp->f_advice->fa_prevstart;
 			else if (fp->f_advice->fa_prevstart != 0 &&
 			    fp->f_advice->fa_prevstart == end + 1)
 				end = fp->f_advice->fa_prevend;
 			fp->f_advice->fa_prevstart = start;
 			fp->f_advice->fa_prevend = end;
 		}
 		mtx_unlock(mtxp);
 		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
 	}
 	
 unlock:
 	return (error);
 }
 
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
  *
  * Assume that the thread A reads from the vnode vp1 into userspace
  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
  * currently not resident, then the system ends up with the call chain
  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
  * backed by the pages of vnode vp1, and some page in buf2 is not
  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
  *
  * To prevent the lock order reversal and deadlock, vn_io_fault() does
  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
  * Instead, it first tries to do the whole range i/o with pagefaults
  * disabled. If all pages in the i/o buffer are resident and mapped,
  * VOP will succeed (ignoring the genuine filesystem errors).
  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
  * i/o in chunks, with all pages in the chunk prefaulted and held
  * using vm_fault_quick_hold_pages().
  *
  * Filesystems using this deadlock avoidance scheme should use the
  * array of the held pages from uio, saved in the curthread->td_ma,
  * instead of doing uiomove().  A helper function
  * vn_io_fault_uiomove() converts uiomove request into
  * uiomove_fromphys() over td_ma array.
  *
  * Since vnode locks do not cover the whole i/o anymore, rangelocks
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
 
 /*
  * Decode vn_io_fault_args and perform the corresponding i/o.
  *
  * A VN_IO_FAULT_FOP request is forwarded to the stored file-level
  * doio callback; a VN_IO_FAULT_VOP request goes straight to
  * VOP_READ() or VOP_WRITE() according to uio->uio_rw.  Any other
  * combination panics.
  */
 static int
 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
     struct thread *td)
 {
 
 	if (args->kind == VN_IO_FAULT_FOP) {
 		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
 		    uio, args->cred, args->flags, td));
 	}
 	if (args->kind == VN_IO_FAULT_VOP) {
 		if (uio->uio_rw == UIO_READ) {
 			return (VOP_READ(args->args.vop_args.vp, uio,
 			    args->flags, args->cred));
 		}
 		if (uio->uio_rw == UIO_WRITE) {
 			return (VOP_WRITE(args->args.vop_args.vp, uio,
 			    args->flags, args->cred));
 		}
 	}
 	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
 	    uio->uio_rw);
 }
 
 /*
  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
  * into args and call vn_io_fault1() to handle faults during the user
  * mode buffer accesses.
  */
 static int
 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
     struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
 	vm_page_t *prev_td_ma;
 	vm_prot_t prot;
 	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
 	int error, cnt, save, saveheld, prev_td_ma_cnt;
 
 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
 	 * uio_offset and uio_resid if an error is encountered during the
 	 * operation.  But, since the iovec may be already advanced,
 	 * uio is still in an inconsistent state.
 	 *
 	 * Cache a copy of the original uio, which is advanced to the redo
 	 * point using UIO_NOCOPY below.
 	 */
 	uio_clone = cloneuio(uio);
 	resid = uio->uio_resid;
 
 	short_uio.uio_segflg = UIO_USERSPACE;
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
 	save = vm_fault_disable_pagefaults();
 	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
 	atomic_add_long(&vn_io_faults_cnt, 1);
 	uio_clone->uio_segflg = UIO_NOCOPY;
 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
 	uio_clone->uio_segflg = uio->uio_segflg;
 
 	saveheld = curthread_pflags_set(TDP_UIOHELD);
 	prev_td_ma = td->td_ma;
 	prev_td_ma_cnt = td->td_ma_cnt;
 
 	while (uio_clone->uio_resid != 0) {
 		len = uio_clone->uio_iov->iov_len;
 		if (len == 0) {
 			KASSERT(uio_clone->uio_iovcnt >= 1,
 			    ("iovcnt underflow"));
 			uio_clone->uio_iov++;
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
 		if (len > io_hold_cnt * PAGE_SIZE)
 			len = io_hold_cnt * PAGE_SIZE;
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		if (end < addr) {
 			error = EFAULT;
 			break;
 		}
 		cnt = atop(end - trunc_page(addr));
 		/*
 		 * A perfectly misaligned address and length could cause
 		 * both the start and the end of the chunk to use partial
 		 * page.  +2 accounts for such a situation.
 		 */
 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 		    addr, len, prot, ma, io_hold_cnt + 2);
 		if (cnt == -1) {
 			error = EFAULT;
 			break;
 		}
 		short_uio.uio_iov = &short_iovec[0];
 		short_iovec[0].iov_base = (void *)addr;
 		short_uio.uio_iovcnt = 1;
 		short_uio.uio_resid = short_iovec[0].iov_len = len;
 		short_uio.uio_offset = uio_clone->uio_offset;
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
 		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		adv = len - short_uio.uio_resid;
 
 		uio_clone->uio_iov->iov_base =
 		    (char *)uio_clone->uio_iov->iov_base + adv;
 		uio_clone->uio_iov->iov_len -= adv;
 		uio_clone->uio_resid -= adv;
 		uio_clone->uio_offset += adv;
 
 		uio->uio_resid -= adv;
 		uio->uio_offset += adv;
 
 		if (error != 0 || adv == 0)
 			break;
 	}
 	td->td_ma = prev_td_ma;
 	td->td_ma_cnt = prev_td_ma_cnt;
 	curthread_pflags_restore(saveheld);
 out:
 	vm_fault_enable_pagefaults(save);
 	free(uio_clone, M_IOV);
 	return (error);
 }
 
 /*
  * Read or write the vnode behind fp, choosing vn_read() or vn_write()
  * from uio->uio_rw.  When do_vn_io_fault() asks for it, the request
  * is run through vn_io_fault1() under a range lock; otherwise the
  * i/o routine is called directly.  The file offset is locked around
  * the whole operation with foffset_lock_uio()/foffset_unlock_uio().
  */
 static int
 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct vn_io_fault_args args;
 	struct vnode *vp;
 	fo_rdwr_t *doio;
 	void *rl_cookie;
 	int error;
 
 	doio = (uio->uio_rw == UIO_READ) ? vn_read : vn_write;
 	vp = fp->f_vnode;
 	foffset_lock_uio(fp, uio, flags);
 	if (!do_vn_io_fault(vp, uio)) {
 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	} else {
 		args.kind = VN_IO_FAULT_FOP;
 		args.args.fop_args.fp = fp;
 		args.args.fop_args.doio = doio;
 		args.cred = active_cred;
 		args.flags = flags | FOF_OFFSET;
 
 		if (uio->uio_rw == UIO_READ) {
 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		} else if ((flags & FOF_OFFSET) == 0 ||
 		    (fp->f_flag & O_APPEND) != 0) {
 			/* For appenders, punt and lock the whole range. */
 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 		} else {
 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		}
 		error = vn_io_fault1(vp, uio, &args, td);
 		vn_rangelock_unlock(vp, rl_cookie);
 	}
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Helper function to perform the requested uiomove operation using
  * the held pages for io->uio_iov[0].iov_base buffer instead of
  * copyin/copyout.  Access to the pages with uiomove_fromphys()
  * instead of iov_base prevents page faults that could occur due to
  * pmap_collect() invalidating the mapping created by
  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
  * object cleanup revoking the write access from page mappings.
  *
  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
  * instead of plain uiomove().
  */
 int
 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 {
 	struct uio transp_uio;
 	struct iovec transp_iov[1];
 	struct thread *td;
 	size_t adv;
 	int error, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove(data, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	transp_iov[0].iov_base = data;
 	transp_uio.uio_iov = &transp_iov[0];
 	transp_uio.uio_iovcnt = 1;
 	if (xfersize > uio->uio_resid)
 		xfersize = uio->uio_resid;
 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 	transp_uio.uio_offset = 0;
 	transp_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * Since transp_iov points to data, and td_ma page array
 	 * corresponds to original uio->uio_iov, we need to invert the
 	 * direction of the i/o operation as passed to
 	 * uiomove_fromphys().
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		transp_uio.uio_rw = UIO_READ;
 		break;
 	case UIO_READ:
 		transp_uio.uio_rw = UIO_WRITE;
 		break;
 	}
 	transp_uio.uio_td = uio->uio_td;
 	error = uiomove_fromphys(td->td_ma,
 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 	    xfersize, &transp_uio);
 	adv = xfersize - transp_uio.uio_resid;
 	pgadv =
 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 	uio->uio_iov->iov_len -= adv;
 	uio->uio_resid -= adv;
 	uio->uio_offset += adv;
 	return (error);
 }
 
 /*
  * Page-array counterpart of vn_io_fault_uiomove(): move up to
  * xfersize bytes between the pages in ma (starting at byte offset
  * "offset") and the user buffer of uio.  When the current thread has
  * TDP_UIOHELD set and uio is a UIO_USERSPACE request, the copy is
  * done page-to-page against td->td_ma with pmap_copy_pages() and 0
  * is returned; otherwise the call is uiomove_fromphys().
  */
 int
 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
     struct uio *uio)
 {
 	struct thread *td;
 	vm_offset_t ubase;
 	int nbytes, npages;
 
 	td = curthread;
 	if (uio->uio_segflg != UIO_USERSPACE ||
 	    (td->td_pflags & TDP_UIOHELD) == 0)
 		return (uiomove_fromphys(ma, offset, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	nbytes = xfersize;
 	if (nbytes > uio->uio_resid)
 		nbytes = uio->uio_resid;
 	ubase = (vm_offset_t)uio->uio_iov->iov_base;
 
 	/* Source and destination swap with the direction of the i/o. */
 	if (uio->uio_rw == UIO_WRITE) {
 		pmap_copy_pages(td->td_ma, ubase & PAGE_MASK, ma,
 		    offset, nbytes);
 	} else if (uio->uio_rw == UIO_READ) {
 		pmap_copy_pages(ma, offset, td->td_ma, ubase & PAGE_MASK,
 		    nbytes);
 	}
 
 	/* Drop the held pages that were consumed in full. */
 	npages = ((ubase + nbytes) >> PAGE_SHIFT) - (ubase >> PAGE_SHIFT);
 	td->td_ma += npages;
 	KASSERT(td->td_ma_cnt >= npages, ("consumed pages %d %d",
 	    td->td_ma_cnt, npages));
 	td->td_ma_cnt -= npages;
 
 	uio->uio_iov->iov_base = (char *)(ubase + nbytes);
 	uio->uio_iov->iov_len -= nbytes;
 	uio->uio_resid -= nbytes;
 	uio->uio_offset += nbytes;
 	return (0);
 }
 
 
 /*
  * File table truncate routine.
  *
  * Sets the size of the vnode behind fp to length through
  * VOP_SETATTR().  Returns EISDIR for a directory, otherwise the first
  * error from vn_start_write(), the MAC check (when configured),
  * vn_writechk() or VOP_SETATTR().
  */
 static int
 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr va;
 	void *cookie;
 	int error;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
 	 */
 	cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		goto rl_unlock;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto vp_unlock;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error != 0)
 		goto vp_unlock;
 #endif
 	error = vn_writechk(vp);
 	if (error != 0)
 		goto vp_unlock;
 	VATTR_NULL(&va);
 	va.va_size = length;
 	error = VOP_SETATTR(vp, &va, fp->f_cred);
 vp_unlock:
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 rl_unlock:
 	vn_rangelock_unlock(vp, cookie);
 	return (error);
 }
 
 /*
  * File table vnode stat routine: run vn_stat() on the vnode behind
  * fp while holding a shared vnode lock.
  */
 static int
 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * Stat a vnode; implementation for the stat syscall
  */
 int
 vn_stat(vp, sb, active_cred, file_cred, td)
 	struct vnode *vp;
 	register struct stat *sb;
 	struct ucred *active_cred;
 	struct ucred *file_cred;
 	struct thread *td;
 {
 	struct vattr vattr;
 	register struct vattr *vap;
 	int error;
 	u_short mode;
 
 #ifdef MAC
 	error = mac_vnode_check_stat(active_cred, file_cred, vp);
 	if (error)
 		return (error);
 #endif
 
 	vap = &vattr;
 
 	/*
 	 * Initialize defaults for new and unusual fields, so that file
 	 * systems which don't support these fields don't need to know
 	 * about them.
 	 */
 	vap->va_birthtime.tv_sec = -1;
 	vap->va_birthtime.tv_nsec = 0;
 	vap->va_fsid = VNOVAL;
 	vap->va_rdev = NODEV;
 
 	error = VOP_GETATTR(vp, vap, active_cred);
 	if (error)
 		return (error);
 
 	/*
 	 * Zero the spare stat fields
 	 */
 	bzero(sb, sizeof *sb);
 
 	/*
 	 * Copy from vattr table
 	 */
 	if (vap->va_fsid != VNOVAL)
 		sb->st_dev = vap->va_fsid;
 	else
 		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 	sb->st_ino = vap->va_fileid;
 	mode = vap->va_mode;
 	switch (vap->va_type) {
 	case VREG:
 		mode |= S_IFREG;
 		break;
 	case VDIR:
 		mode |= S_IFDIR;
 		break;
 	case VBLK:
 		mode |= S_IFBLK;
 		break;
 	case VCHR:
 		mode |= S_IFCHR;
 		break;
 	case VLNK:
 		mode |= S_IFLNK;
 		break;
 	case VSOCK:
 		mode |= S_IFSOCK;
 		break;
 	case VFIFO:
 		mode |= S_IFIFO;
 		break;
 	default:
 		return (EBADF);
 	};
 	sb->st_mode = mode;
 	sb->st_nlink = vap->va_nlink;
 	sb->st_uid = vap->va_uid;
 	sb->st_gid = vap->va_gid;
 	sb->st_rdev = vap->va_rdev;
 	if (vap->va_size > OFF_MAX)
 		return (EOVERFLOW);
 	sb->st_size = vap->va_size;
 	sb->st_atim = vap->va_atime;
 	sb->st_mtim = vap->va_mtime;
 	sb->st_ctim = vap->va_ctime;
 	sb->st_birthtim = vap->va_birthtime;
 
         /*
 	 * According to www.opengroup.org, the meaning of st_blksize is 
 	 *   "a filesystem-specific preferred I/O block size for this 
 	 *    object.  In some filesystem types, this may vary from file
 	 *    to file"
 	 * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
 	 */
 
 	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
 	
 	sb->st_flags = vap->va_flags;
 	if (priv_check(td, PRIV_VFS_GENERATION))
 		sb->st_gen = 0;
 	else
 		sb->st_gen = vap->va_gen;
 
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 	return (0);
 }
 
 /*
  * File table vnode ioctl routine.
  *
  * Only regular files and directories are handled; every other vnode
  * type gets ENOTTY.  FIONREAD reports the distance from the file
  * offset to the file size, FIONBIO and FIOASYNC succeed without doing
  * anything, and every other command is passed to VOP_IOCTL().
  */
 static int
 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vattr va;
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VDIR && vp->v_type != VREG)
 		return (ENOTTY);
 
 	switch (com) {
 	case FIONREAD:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &va, active_cred);
 		VOP_UNLOCK(vp, 0);
 		if (error == 0)
 			*(int *)data = va.va_size - fp->f_offset;
 		return (error);
 	case FIONBIO:
 	case FIOASYNC:
 		return (0);
 	default:
 		return (VOP_IOCTL(vp, com, data, fp->f_flag,
 		    active_cred, td));
 	}
 }
 
 /*
  * File table vnode poll routine.
  *
  * Returns the VOP_POLL() result.  With MAC configured, the vnode is
  * first checked with mac_vnode_check_poll() under an exclusive vnode
  * lock, and a non-zero check result is returned instead.
  */
 static int
 vn_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 #ifdef MAC
 	int error;
 #endif
 
 	vp = fp->f_vnode;
 #ifdef MAC
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 	VOP_UNLOCK(vp, 0);
 	if (error != 0)
 		return (error);
 #endif
 	return (VOP_POLL(vp, events, fp->f_cred, td));
 }
 
 /*
  * Acquire the requested lock and then check for validity.  LK_RETRY
  * permits vn_lock to return doomed vnodes.
  *
  * vp         - vnode to lock.
  * flags      - lock request passed to VOP_LOCK1(); at least one bit
  *              of LK_TYPE_MASK must be set (asserted).
  * file, line - caller location, forwarded to VOP_LOCK1().
  *
  * Returns 0 when the lock is held on return.  Returns ENOENT when the
  * lock was obtained on a VI_DOOMED vnode without LK_RETRY; the lock is
  * dropped again in that case.  Otherwise returns the VOP_LOCK1() error.
  */
 int
 _vn_lock(struct vnode *vp, int flags, char *file, int line)
 {
 	int error;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock called with no locktype."));
 	do {
 #ifdef DEBUG_VFS_LOCKS
 		KASSERT(vp->v_holdcnt != 0,
 		    ("vn_lock %p: zero hold count", vp));
 #endif
 		error = VOP_LOCK1(vp, flags, file, line);
 		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
 		/* With LK_RETRY the lock attempt is not expected to fail. */
 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
 		    ("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
 		    flags, error));
 		/*
 		 * Callers specify LK_RETRY if they wish to get dead vnodes.
 		 * If RETRY is not set, we return ENOENT instead.
 		 */
 		if (error == 0 && vp->v_iflag & VI_DOOMED &&
 		    (flags & LK_RETRY) == 0) {
 			VOP_UNLOCK(vp, 0);
 			error = ENOENT;
 			break;
 		}
 		/* Try again only when LK_RETRY is set and the attempt failed. */
 	} while (flags & LK_RETRY && error != 0);
 	return (error);
 }
 
 /*
  * File table vnode close routine.
  */
 static int
 vn_closefile(fp, td)
 	struct file *fp;
 	struct thread *td;
 {
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	vp = fp->f_vnode;
 	fp->f_ops = &badfileops;
 
 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
 		vref(vp);
 
 	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
 
 	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 		vrele(vp);
 	}
 	return (error);
 }
 
 static bool
 vn_suspendable_mp(struct mount *mp)
 {
 
 	return ((mp->mnt_kern_flag & MNTK_SUSPENDABLE) != 0);
 }
 
 /*
  * Report whether writes through vp (or, when vp is NULL, to the mount
  * already stored in *mpp) are subject to suspension.  When vp is not
  * NULL, *mpp is overwritten with vp->v_mount first.  A NULL mount is
  * never suspendable.
  */
 static bool
 vn_suspendable(struct vnode *vp, struct mount **mpp)
 {
 	struct mount *mp;
 
 	if (vp != NULL)
 		*mpp = vp->v_mount;
 	mp = *mpp;
 	return (mp != NULL && vn_suspendable_mp(mp));
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  *
  * Must be called with the mount interlock held (asserted); the
  * interlock is always released before returning.  On error, or when
  * V_XSLEEP is set in flags, one mount reference is dropped with
  * MNT_REL() -- presumably the reference the caller obtained before the
  * call -- and mnt_writeopcount is left untouched.
  *
  * Returns 0 on success, EWOULDBLOCK when the mount is suspending and
  * V_NOWAIT is set, or the msleep() error.
  */
 static int
 vn_start_write_locked(struct mount *mp, int flags)
 {
 	int error, mflags;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	error = 0;
 
 	/*
 	 * Check on status of suspension.
 	 */
 	/* Skipped only when this thread owns the suspension and has TDP_IGNSUSP. */
 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 	    mp->mnt_susp_owner != curthread) {
 		/* PCATCH is honoured only for filesystems flagged VFCF_SBDRY. */
 		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
 		    (flags & PCATCH) : 0) | (PUSER - 1);
 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			if (flags & V_NOWAIT) {
 				error = EWOULDBLOCK;
 				goto unlock;
 			}
 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 			    "suspfs", 0);
 			if (error)
 				goto unlock;
 		}
 	}
 	/* V_XSLEEP callers only wait out the suspension; no write is started. */
 	if (flags & V_XSLEEP)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 int
 vn_start_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
 	struct mount *mp;
 	int error;
 
 	if (!vn_suspendable(vp, mpp))
 		return (0);
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 
 	return (vn_start_write_locked(mp, flags));
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_start_secondary_write(vp, mpp, flags)
 	struct vnode *vp;
 	struct mount **mpp;
 	int flags;
 {
 	struct mount *mp;
 	int error;
 
 	if (!vn_suspendable(vp, mpp))
 		return (0);
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
 	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
 	    "suspfs", 0);
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  *
  * A NULL or non-suspendable mp is ignored.  Otherwise one mount
  * reference and one unit of mnt_writeopcount are released.
  */
 void
 vn_finished_write(struct mount *mp)
 {
 
 	if (mp == NULL || !vn_suspendable_mp(mp))
 		return;
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	if (--mp->mnt_writeopcount < 0)
 		panic("vn_finished_write: neg cnt");
 	if (mp->mnt_writeopcount <= 0 &&
 	    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 
 /*
  * Filesystem secondary write operation has completed. If we are
  * suspending and this operation is the last one, notify the suspender
  * that the suspension is now in effect.
  *
  * A NULL or non-suspendable mp is ignored.  Otherwise one mount
  * reference and one unit of mnt_secondary_writes are released.
  */
 void
 vn_finished_secondary_write(struct mount *mp)
 {
 
 	if (mp == NULL || !vn_suspendable_mp(mp))
 		return;
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	if (--mp->mnt_secondary_writes < 0)
 		panic("vn_finished_secondary_write: neg cnt");
 	if (mp->mnt_secondary_writes <= 0 &&
 	    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
 		wakeup(&mp->mnt_secondary_writes);
 	MNT_IUNLOCK(mp);
 }
 
 
 
 /*
  * Request a filesystem to suspend write operations.
  *
  * mp    - mount point to suspend; must be suspendable (asserted).
  * flags - VS_SKIP_UNMOUNT makes the call fail with EBUSY when the
  *         mount point has MNTK_UNMOUNT set.
  *
  * Returns EALREADY when the current thread already owns the
  * suspension, EBUSY as described above, or the result of
  * VFS_SYNC(mp, MNT_SUSPEND).  When VFS_SYNC() fails, the suspension is
  * undone with vfs_write_resume() before returning.
  */
 int
 vfs_write_suspend(struct mount *mp, int flags)
 {
 	int error;
 
 	MPASS(vn_suspendable_mp(mp));
 
 	MNT_ILOCK(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		MNT_IUNLOCK(mp);
 		return (EALREADY);
 	}
 	/* Wait for a suspension owned by another thread to end. */
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
 	 * own busy reference and drain for writers, we deadlock with
 	 * the reference draining in the unmount path.  Callers of
 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 	 * vfs_busy() reference is owned and caller is not in the
 	 * unmount context.
 	 */
 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
 		return (EBUSY);
 	}
 
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	mp->mnt_susp_owner = curthread;
 	/*
 	 * Sleep while writes are in progress; PDROP makes msleep()
 	 * return without the interlock, so both branches leave it
 	 * released.
 	 */
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
 		vfs_write_resume(mp, 0);
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  *
  * flags - VR_START_WRITE additionally leaves the caller with a write
  *         operation started on mp; VR_NO_SUSPCLR skips the
  *         VFS_SUSP_CLEAN() call after a suspension is lifted.
  */
 void
 vfs_write_resume(struct mount *mp, int flags)
 {
 
 	MPASS(vn_suspendable_mp(mp));
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/* Nothing to resume; at most start the requested write. */
 		if ((flags & VR_START_WRITE) == 0) {
 			MNT_IUNLOCK(mp);
 		} else {
 			MNT_REF(mp);
 			/* vn_start_write_locked() releases the interlock. */
 			vn_start_write_locked(mp, 0);
 		}
 		return;
 	}
 
 	KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 	mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED);
 	mp->mnt_susp_owner = NULL;
 	wakeup(&mp->mnt_writeopcount);
 	wakeup(&mp->mnt_flag);
 	curthread->td_pflags &= ~TDP_IGNSUSP;
 	if ((flags & VR_START_WRITE) != 0) {
 		MNT_REF(mp);
 		mp->mnt_writeopcount++;
 	}
 	MNT_IUNLOCK(mp);
 	if ((flags & VR_NO_SUSPCLR) == 0)
 		VFS_SUSP_CLEAN(mp);
 }
 
 /*
  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
  * methods.
  *
  * The caller's write reference is given up before each suspend
  * attempt and taken again whenever the attempt fails or does not end
  * with MNTK_SUSPENDED set.  On success the MNTK_SUSPENDED and
  * MNTK_SUSPEND2 flags are cleared, sleepers on mnt_flag are woken, and
  * TDP_IGNSUSP is set on the current thread.
  *
  * Returns 0 on success or the vfs_write_suspend() error.
  */
 int
 vfs_write_suspend_umnt(struct mount *mp)
 {
 	int error;
 
 	MPASS(vn_suspendable_mp(mp));
 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 	    ("vfs_write_suspend_umnt: recursed"));
 
 	/* dounmount() already called vn_start_write(). */
 	for (;;) {
 		vn_finished_write(mp);
 		error = vfs_write_suspend(mp, 0);
 		if (error != 0) {
 			/* Re-take the write reference before failing. */
 			vn_start_write(NULL, &mp, V_WAIT);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		/* Leave the loop with the interlock still held. */
 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			break;
 		MNT_IUNLOCK(mp);
 		vn_start_write(NULL, &mp, V_WAIT);
 	}
 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 	wakeup(&mp->mnt_flag);
 	MNT_IUNLOCK(mp);
 	curthread->td_pflags |= TDP_IGNSUSP;
 	return (0);
 }
 
 /*
  * Implement kqueues for files by handing the knote to the
  * VOP_KQFILTER() operation of the vnode behind fp.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 	return (VOP_KQFILTER(vp, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp, 0);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
 	}
 
 	return (error);
 }
 
 static int
 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 }
 
 /*
  * Inode-number flavour of vn_vget_ino_gen(): the lookup itself is
  * performed by vn_get_ino_alloc_vget() with a pointer to ino.
  */
 int
 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 {
 	int error;
 
 	error = vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags,
 	    rvp);
 	return (error);
 }
 
 /*
  * Obtain another vnode on the same mount as vp by calling
  * alloc(mp, alloc_arg, lkflags, rvp) while vp is temporarily unlocked.
  *
  * vp must be locked (shared or exclusive) on entry; the same lock type
  * is re-acquired before returning.  The mount is kept busy across the
  * alloc() call.  ENOENT is returned if the mount cannot be busied or if
  * vp is found doomed after being relocked; in the latter case a vnode
  * successfully returned by alloc() is released first.  Otherwise the
  * result of alloc() is returned.
  */
 int
 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
     int lkflags, struct vnode **rvp)
 {
 	struct mount *mp;
 	int ltype, error;
 
 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 	mp = vp->v_mount;
 	ltype = VOP_ISLOCKED(vp);
 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 	    ("vn_vget_ino: vp not locked"));
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		/*
 		 * The non-blocking busy failed.  Hold a reference on the
 		 * mount, drop the vnode lock and retry with a blocking
 		 * vfs_busy(), then relock vp and drop the reference.
 		 */
 		vfs_ref(mp);
 		VOP_UNLOCK(vp, 0);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, ltype | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0)
 			return (ENOENT);
 		/* vp may have been doomed while it was unlocked. */
 		if (vp->v_iflag & VI_DOOMED) {
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp, 0);
 	error = alloc(mp, alloc_arg, lkflags, rvp);
 	vfs_unbusy(mp);
 	/*
 	 * Relock vp unless alloc() succeeded and handed back vp itself.
 	 * On failure *rvp must not be trusted (it may be stale or never
 	 * written), so vp is always relocked in that case.
 	 */
 	if (error != 0 || *rvp != vp)
 		vn_lock(vp, ltype | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		if (error == 0) {
 			/* Release whatever alloc() returned. */
 			if (*rvp == vp)
 				vunref(vp);
 			else
 				vput(*rvp);
 		}
 		error = ENOENT;
 	}
 	return (error);
 }
 
 int
 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
     const struct thread *td)
 {
 
 	if (vp->v_type != VREG || td == NULL)
 		return (0);
 	PROC_LOCK(td->td_proc);
 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
 	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 		kern_psignal(td->td_proc, SIGXFSZ);
 		PROC_UNLOCK(td->td_proc);
 		return (EFBIG);
 	}
 	PROC_UNLOCK(td->td_proc);
 	return (0);
 }
 
 /*
  * Change the mode of the vnode behind fp by passing it to setfmode().
  * With AUDIT compiled in, the vnode is first recorded as an audit
  * argument under a shared vnode lock.
  */
 int
 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp, 0);
 #endif
 	error = setfmode(td, active_cred, vp, mode);
 	return (error);
 }
 
 /*
  * Change the owner and group of the vnode behind fp by passing it to
  * setfown().  With AUDIT compiled in, the vnode is first recorded as an
  * audit argument under a shared vnode lock.
  */
 int
 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp, 0);
 #endif
 	error = setfown(td, active_cred, vp, uid, gid);
 	return (error);
 }
 
 /*
  * Remove the pages in the index range [start, end) arguments from the
  * VM object attached to vp, if there is one, under the object's write
  * lock.  The range is passed unchanged to vm_object_page_remove().
  */
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t obj;
 
 	obj = vp->v_object;
 	if (obj != NULL) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_remove(obj, start, end, 0);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 }
 
 /*
  * Generic FIOSEEKHOLE / FIOSEEKDATA implementation built on VOP_BMAP().
  *
  * Starting at *off, the file is probed one filesystem I/O block at a
  * time.  A block for which VOP_BMAP() reports -1 is treated as a hole,
  * anything else as data.  On success *off is updated to the start of
  * the first matching block (never below the original *off), or to the
  * file size when FIOSEEKHOLE finds no hole before end of file.
  *
  * Returns EBADF if the vnode cannot be locked, ENOTTY for non-regular
  * files or when VOP_BMAP() is unsupported, ENXIO when *off is at or
  * beyond end of file or FIOSEEKDATA finds no data, and otherwise the
  * error from VOP_GETATTR() or from the last VOP_BMAP() call.
  */
 int
 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 {
 	struct vattr va;
 	daddr_t bn, bnp;
 	uint64_t bsize;
 	off_t noff;
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("Wrong command %lu", cmd));
 
 	if (vn_lock(vp, LK_SHARED) != 0)
 		return (EBADF);
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto unlock;
 	}
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		goto unlock;
 	noff = *off;
 	if (noff >= va.va_size) {
 		error = ENXIO;
 		goto unlock;
 	}
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	/*
 	 * noff may start in the middle of a block.  Advance it to the
 	 * next block boundary rather than by a full block, so that it
 	 * stays in step with bn and the final, partial block of the file
 	 * is still probed before the loop terminates.
 	 */
 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
 	    noff % bsize) {
 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 		if (error == EOPNOTSUPP) {
 			error = ENOTTY;
 			goto unlock;
 		}
 		/*
 		 * NOTE(review): other VOP_BMAP() errors are not checked
 		 * before bnp is examined -- confirm this is intended.
 		 */
 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
 			noff = bn * bsize;
 			/* Never report a position before the start. */
 			if (noff < *off)
 				noff = *off;
 			goto unlock;
 		}
 	}
 	if (noff > va.va_size)
 		noff = va.va_size;
 	/* noff == va.va_size. There is an implicit hole at the end of file. */
 	if (cmd == FIOSEEKDATA)
 		error = ENXIO;
 unlock:
 	VOP_UNLOCK(vp, 0);
 	if (error == 0)
 		*off = noff;
 	return (error);
 }
 
 /*
  * Seek on a vnode-backed file.  Computes the new offset from whence
  * (L_INCR, L_XTND, L_SET, SEEK_DATA, SEEK_HOLE) and offset, stores it in
  * td->td_uretoff.tdu_off on success and hands it to foffset_unlock().
  * Returns 0, EOVERFLOW when the resulting offset would exceed OFF_MAX,
  * EINVAL for an unknown whence or a negative result on anything but a
  * character device, or the error from VOP_GETATTR() / fo_ioctl().
  */
 int
 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct ucred *cred;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t foffset, size;
 	int error, noneg;
 
 	cred = td->td_ucred;
 	vp = fp->f_vnode;
 	/* Presumably returns the current file offset; paired with the unlock below. */
 	foffset = foffset_lock(fp, 0);
 	/* Negative offsets are only tolerated on character devices. */
 	noneg = (vp->v_type != VCHR);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		/* Relative to the current offset; guard the addition. */
 		if (noneg &&
 		    (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		/* Relative to the size reported by VOP_GETATTR(). */
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		VOP_UNLOCK(vp, 0);
 		if (error)
 			break;
 
 		/*
 		 * If the file references a disk device, then fetch
 		 * the media size and use that to determine the ending
 		 * offset.
 		 */
 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 			vattr.va_size = size;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		/* Absolute position: offset is used as given. */
 		break;
 	case SEEK_DATA:
 		/* The ioctl updates offset in place. */
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		break;
 	case SEEK_HOLE:
 		/* The ioctl updates offset in place. */
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	VFS_KNOTE_UNLOCKED(vp, 0);
 	/* Presumably the value handed back to the caller of the syscall. */
 	td->td_uretoff.tdu_off = offset;
 drop:
 	/* On error FOF_NOUPDATE is passed so the offset is not committed. */
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 int
 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	int error;
 
 	/*
 	 * Permission is granted when the caller is the owner of the
 	 * file, is the super-user, or holds ACL_WRITE_ATTRIBUTES on the
 	 * file.  When the time pointer is null (VA_UTIMES_NULL), plain
 	 * write permission on the file is sufficient as well.
 	 *
 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 	 * will be allowed to set the times [..] to the current
 	 * server time.
 	 */
 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 	if (error == 0)
 		return (0);
 	if ((vap->va_vaflags & VA_UTIMES_NULL) == 0)
 		return (error);
 	return (VOP_ACCESS(vp, VWRITE, cred, td));
 }
 
 /*
  * Fill kif for a vnode-backed file.  The descriptor type is set from
  * fp->f_type; the vnode details come from vn_fill_kinfo_vnode(), which
  * is called with a vnode reference held and the filedesc shared lock
  * dropped.  The lock is re-acquired before returning.
  */
 int
 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct vnode *vp;
 	int error;
 
 	kif->kf_type = (fp->f_type == DTYPE_FIFO) ? KF_TYPE_FIFO :
 	    KF_TYPE_VNODE;
 
 	vp = fp->f_vnode;
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 
 	error = vn_fill_kinfo_vnode(vp, kif);
 
 	vrele(vp);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 int
 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
 	int error;
 
 	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
 	freepath = NULL;
 	fullpath = "-";
 	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
 	if (error == 0) {
 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp, 0);
 	if (error != 0)
 		return (error);
 	if (va.va_fsid != VNOVAL)
 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 	else
 		kif->kf_un.kf_file.kf_file_fsid =
 		    vp->v_mount->mnt_stat.f_fsid.val[0];
+	kif->kf_un.kf_file.kf_file_fsid_freebsd10 =
+	    kif->kf_un.kf_file.kf_file_fsid; /* truncate */
 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
+	kif->kf_un.kf_file.kf_file_mode_freebsd10 =
+	    kif->kf_un.kf_file.kf_file_mode; /* truncate */
 	kif->kf_un.kf_file.kf_file_size = va.va_size;
 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
+	kif->kf_un.kf_file.kf_file_rdev_freebsd10 =
+	    kif->kf_un.kf_file.kf_file_rdev; /* truncate */
 	return (0);
 }
Index: projects/ino64/sys/sys/user.h
===================================================================
--- projects/ino64/sys/sys/user.h	(revision 276110)
+++ projects/ino64/sys/sys/user.h	(revision 276111)
@@ -1,540 +1,566 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2007 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)user.h	8.2 (Berkeley) 9/23/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_USER_H_
 #define _SYS_USER_H_
 
 #include <machine/pcb.h>
 #ifndef _KERNEL
 /* stuff that *used* to be included by user.h, or is now needed */
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/ucred.h>
 #include <sys/uio.h>
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/proc.h>
 #include <vm/vm.h>		/* XXX */
 #include <vm/vm_param.h>	/* XXX */
 #include <vm/pmap.h>		/* XXX */
 #include <vm/vm_map.h>		/* XXX */
 #endif /* !_KERNEL */
 #ifndef _SYS_RESOURCEVAR_H_
 #include <sys/resourcevar.h>
 #endif
 #ifndef _SYS_SIGNALVAR_H_
 #include <sys/signalvar.h>
 #endif
 #ifndef _SYS_SOCKET_VAR_H_
 #include <sys/socket.h>
 #endif
 #include <sys/caprights.h>
 
 /*
  * KERN_PROC subtype ops return arrays of selected proc structure entries:
  *
  * This struct includes several arrays of spare space, with different arrays
  * for different standard C-types.  When adding new variables to this struct,
  * the space for byte-aligned data should be taken from the ki_sparestring,
  * pointers from ki_spareptrs, word-aligned data from ki_spareints, and
  * doubleword-aligned data from ki_sparelongs.  Make sure the space for new
  * variables come from the array which matches the size and alignment of
  * those variables on ALL hardware platforms, and then adjust the appropriate
  * KI_NSPARE_* value(s) to match.
  *
  * Always verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE on all
  * platforms after you have added new variables.  Note that if you change
  * the value of KINFO_PROC_SIZE, then many userland programs will stop
  * working until they are recompiled!
  *
  * Once you have added the new field, you will need to add code to initialize
  * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and
  * function kvm_proclist in lib/libkvm/kvm_proc.c .
  */
-#define	KI_NSPARE_INT	4
+#define	KI_NSPARE_INT	2
 #define	KI_NSPARE_LONG	12
 #define	KI_NSPARE_PTR	6
 
 #ifndef _KERNEL
 #ifndef KINFO_PROC_SIZE
 #error "Unknown architecture"
 #endif
 #endif /* !_KERNEL */
 
 #define	WMESGLEN	8		/* size of returned wchan message */
 #define	LOCKNAMELEN	8		/* size of returned lock name */
 #define	TDNAMLEN	16		/* size of returned thread name */
 #define	COMMLEN		19		/* size of returned ki_comm name */
 #define	KI_EMULNAMELEN	16		/* size of returned ki_emul */
 #define	KI_NGROUPS	16		/* number of groups in ki_groups */
 #define	LOGNAMELEN	17		/* size of returned ki_login */
 #define	LOGINCLASSLEN	17		/* size of returned ki_loginclass */
 
 #ifndef BURN_BRIDGES
 #define	OCOMMLEN	TDNAMLEN	
 #define	ki_ocomm	ki_tdname
 #endif
 
 /* Flags for the process credential. */
 #define	KI_CRF_CAPABILITY_MODE	0x00000001
 /*
  * Steal a bit from ki_cr_flags to indicate that the cred had more than
  * KI_NGROUPS groups.
  */
 #define KI_CRF_GRP_OVERFLOW	0x80000000
 
 struct kinfo_proc {
 	int	ki_structsize;		/* size of this structure */
 	int	ki_layout;		/* reserved: layout identifier */
 	struct	pargs *ki_args;		/* address of command arguments */
 	struct	proc *ki_paddr;		/* address of proc */
 	struct	user *ki_addr;		/* kernel virtual addr of u-area */
 	struct	vnode *ki_tracep;	/* pointer to trace file */
 	struct	vnode *ki_textvp;	/* pointer to executable file */
 	struct	filedesc *ki_fd;	/* pointer to open file info */
 	struct	vmspace *ki_vmspace;	/* pointer to kernel vmspace struct */
 	void	*ki_wchan;		/* sleep address */
 	pid_t	ki_pid;			/* Process identifier */
 	pid_t	ki_ppid;		/* parent process id */
 	pid_t	ki_pgid;		/* process group id */
 	pid_t	ki_tpgid;		/* tty process group id */
 	pid_t	ki_sid;			/* Process session ID */
 	pid_t	ki_tsid;		/* Terminal session ID */
 	short	ki_jobc;		/* job control counter */
 	short	ki_spare_short1;	/* unused (just here for alignment) */
-	dev_t	ki_tdev;		/* controlling tty dev */
+	uint32_t ki_tdev_freebsd10;	/* controlling tty dev */
 	sigset_t ki_siglist;		/* Signals arrived but not delivered */
 	sigset_t ki_sigmask;		/* Current signal mask */
 	sigset_t ki_sigignore;		/* Signals being ignored */
 	sigset_t ki_sigcatch;		/* Signals being caught by user */
 	uid_t	ki_uid;			/* effective user id */
 	uid_t	ki_ruid;		/* Real user id */
 	uid_t	ki_svuid;		/* Saved effective user id */
 	gid_t	ki_rgid;		/* Real group id */
 	gid_t	ki_svgid;		/* Saved effective group id */
 	short	ki_ngroups;		/* number of groups */
 	short	ki_spare_short2;	/* unused (just here for alignment) */
 	gid_t	ki_groups[KI_NGROUPS];	/* groups */
 	vm_size_t ki_size;		/* virtual size */
 	segsz_t ki_rssize;		/* current resident set size in pages */
 	segsz_t ki_swrss;		/* resident set size before last swap */
 	segsz_t ki_tsize;		/* text size (pages) XXX */
 	segsz_t ki_dsize;		/* data size (pages) XXX */
 	segsz_t ki_ssize;		/* stack size (pages) */
 	u_short	ki_xstat;		/* Exit status for wait & stop signal */
 	u_short	ki_acflag;		/* Accounting flags */
 	fixpt_t	ki_pctcpu;	 	/* %cpu for process during ki_swtime */
 	u_int	ki_estcpu;	 	/* Time averaged value of ki_cpticks */
 	u_int	ki_slptime;	 	/* Time since last blocked */
 	u_int	ki_swtime;	 	/* Time swapped in or out */
 	u_int	ki_cow;			/* number of copy-on-write faults */
 	u_int64_t ki_runtime;		/* Real time in microsec */
 	struct	timeval ki_start;	/* starting time */
 	struct	timeval ki_childtime;	/* time used by process children */
 	long	ki_flag;		/* P_* flags */
 	long	ki_kiflag;		/* KI_* flags (below) */
 	int	ki_traceflag;		/* Kernel trace points */
 	char	ki_stat;		/* S* process status */
 	signed char ki_nice;		/* Process "nice" value */
 	char	ki_lock;		/* Process lock (prevent swap) count */
 	char	ki_rqindex;		/* Run queue index */
 	u_char	ki_oncpu_old;		/* Which cpu we are on (legacy) */
 	u_char	ki_lastcpu_old;		/* Last cpu we were on (legacy) */
 	char	ki_tdname[TDNAMLEN+1];	/* thread name */
 	char	ki_wmesg[WMESGLEN+1];	/* wchan message */
 	char	ki_login[LOGNAMELEN+1];	/* setlogin name */
 	char	ki_lockname[LOCKNAMELEN+1]; /* lock name */
 	char	ki_comm[COMMLEN+1];	/* command name */
 	char	ki_emul[KI_EMULNAMELEN+1];  /* emulation name */
 	char	ki_loginclass[LOGINCLASSLEN+1]; /* login class */
 	/*
 	 * When adding new variables, take space for char-strings from the
 	 * front of ki_sparestrings, and ints from the end of ki_spareints.
 	 * That way the spare room from both arrays will remain contiguous.
 	 */
 	char	ki_sparestrings[50];	/* spare string space */
 	int	ki_spareints[KI_NSPARE_INT];	/* spare room for growth */
+	uint64_t ki_tdev;		/* controlling tty dev */
 	int	ki_oncpu;		/* Which cpu we are on */
 	int	ki_lastcpu;		/* Last cpu we were on */
 	int	ki_tracer;		/* Pid of tracing process */
 	int	ki_flag2;		/* P2_* flags */
 	int	ki_fibnum;		/* Default FIB number */
 	u_int	ki_cr_flags;		/* Credential flags */
 	int	ki_jid;			/* Process jail ID */
 	int	ki_numthreads;		/* XXXKSE number of threads in total */
 	lwpid_t	ki_tid;			/* XXXKSE thread id */
 	struct	priority ki_pri;	/* process priority */
 	struct	rusage ki_rusage;	/* process rusage statistics */
 	/* XXX - most fields in ki_rusage_ch are not (yet) filled in */
 	struct	rusage ki_rusage_ch;	/* rusage of children processes */
 	struct	pcb *ki_pcb;		/* kernel virtual addr of pcb */
 	void	*ki_kstack;		/* kernel virtual addr of stack */
 	void	*ki_udata;		/* User convenience pointer */
 	struct	thread *ki_tdaddr;	/* address of thread */
 	/*
 	 * When adding new variables, take space for pointers from the
 	 * front of ki_spareptrs, and longs from the end of ki_sparelongs.
 	 * That way the spare room from both arrays will remain contiguous.
 	 */
 	void	*ki_spareptrs[KI_NSPARE_PTR];	/* spare room for growth */
 	long	ki_sparelongs[KI_NSPARE_LONG];	/* spare room for growth */
 	long	ki_sflag;		/* PS_* flags */
 	long	ki_tdflags;		/* XXXKSE kthread flag */
 };
 void fill_kinfo_proc(struct proc *, struct kinfo_proc *);
 /* XXX - the following two defines are temporary */
 #define	ki_childstime	ki_rusage_ch.ru_stime
 #define	ki_childutime	ki_rusage_ch.ru_utime
 
 /*
  *  Legacy PS_ flag.  This moved to p_flag but is maintained for
  *  compatibility.
  */
 #define	PS_INMEM	0x00001		/* Loaded into memory. */
 
 /* ki_sessflag values */
 #define	KI_CTTY		0x00000001	/* controlling tty vnode active */
 #define	KI_SLEADER	0x00000002	/* session leader */
 #define	KI_LOCKBLOCK	0x00000004	/* proc blocked on lock ki_lockname */
 
 /*
  * This used to be the per-process structure containing data that
  * isn't needed in core when the process is swapped out, but now it
  * remains only for the benefit of a.out core dumps.
  */
 struct user {
 	/* Process statistics followed by the exported process record. */
 	struct	pstats u_stats;		/* *p_stats */
 	struct	kinfo_proc u_kproc;	/* eproc */
 };
 
 /*
  * The KERN_PROC_FILE sysctl allows a process to dump the file descriptor
  * array of another process.
  */
 #define	KF_ATTR_VALID	0x0001
 
 #define	KF_TYPE_NONE	0
 #define	KF_TYPE_VNODE	1
 #define	KF_TYPE_SOCKET	2
 #define	KF_TYPE_PIPE	3
 #define	KF_TYPE_FIFO	4
 #define	KF_TYPE_KQUEUE	5
 #define	KF_TYPE_CRYPTO	6
 #define	KF_TYPE_MQUEUE	7
 #define	KF_TYPE_SHM	8
 #define	KF_TYPE_SEM	9
 #define	KF_TYPE_PTS	10
 #define	KF_TYPE_PROCDESC	11
 #define	KF_TYPE_UNKNOWN	255
 
 #define	KF_VTYPE_VNON	0
 #define	KF_VTYPE_VREG	1
 #define	KF_VTYPE_VDIR	2
 #define	KF_VTYPE_VBLK	3
 #define	KF_VTYPE_VCHR	4
 #define	KF_VTYPE_VLNK	5
 #define	KF_VTYPE_VSOCK	6
 #define	KF_VTYPE_VFIFO	7
 #define	KF_VTYPE_VBAD	8
 #define	KF_VTYPE_UNKNOWN	255
 
 #define	KF_FD_TYPE_CWD	-1	/* Current working directory */
 #define	KF_FD_TYPE_ROOT	-2	/* Root directory */
 #define	KF_FD_TYPE_JAIL	-3	/* Jail directory */
 #define	KF_FD_TYPE_TRACE	-4	/* Ktrace vnode */
 #define	KF_FD_TYPE_TEXT	-5	/* Text vnode */
 #define	KF_FD_TYPE_CTTY	-6	/* Controlling terminal */
 
 #define	KF_FLAG_READ		0x00000001
 #define	KF_FLAG_WRITE		0x00000002
 #define	KF_FLAG_APPEND		0x00000004
 #define	KF_FLAG_ASYNC		0x00000008
 #define	KF_FLAG_FSYNC		0x00000010
 #define	KF_FLAG_NONBLOCK	0x00000020
 #define	KF_FLAG_DIRECT		0x00000040
 #define	KF_FLAG_HASLOCK		0x00000080
 #define	KF_FLAG_SHLOCK		0x00000100
 #define	KF_FLAG_EXLOCK		0x00000200
 #define	KF_FLAG_NOFOLLOW	0x00000400
 #define	KF_FLAG_CREAT		0x00000800
 #define	KF_FLAG_TRUNC		0x00001000
 #define	KF_FLAG_EXCL		0x00002000
 #define	KF_FLAG_EXEC		0x00004000
 
 /*
  * Old format.  Has variable hidden padding due to alignment.
  * This is a compatibility hack for pre-build 7.1 packages.
  */
 #if defined(__amd64__)
 #define	KINFO_OFILE_SIZE	1328
 #endif
 #if defined(__i386__)
 #define	KINFO_OFILE_SIZE	1324
 #endif
 
 /*
  * Legacy per-descriptor record.  KINFO_OFILE_SIZE above gives a
  * different size for amd64 and i386; presumably that difference is the
  * hidden padding noted before kf_offset -- TODO confirm.
  */
 struct kinfo_ofile {
 	int	kf_structsize;			/* Size of kinfo_file. */
 	int	kf_type;			/* Descriptor type. */
 	int	kf_fd;				/* Array index. */
 	int	kf_ref_count;			/* Reference count. */
 	int	kf_flags;			/* Flags. */
 	/* XXX Hidden alignment padding here on amd64 */
 	off_t	kf_offset;			/* Seek location. */
 	int	kf_vnode_type;			/* Vnode type. */
 	int	kf_sock_domain;			/* Socket domain. */
 	int	kf_sock_type;			/* Socket type. */
 	int	kf_sock_protocol;		/* Socket protocol. */
 	char	kf_path[PATH_MAX];	/* Path to file, if any. */
 	struct sockaddr_storage kf_sa_local;	/* Socket address. */
 	struct sockaddr_storage	kf_sa_peer;	/* Peer address. */
 };
 
 #if defined(__amd64__) || defined(__i386__)
 /*
  * This size should never be changed. If you really need to, you must provide
  * backward ABI compatibility by allocating a new sysctl MIB that will return
  * the new structure. The current structure has to be returned by the current
  * sysctl MIB. See how it is done for the kinfo_ofile structure.
  */
 #define	KINFO_FILE_SIZE	1392
 #endif
 
 struct kinfo_file {
 	int		kf_structsize;		/* Variable size of record. */
 	int		kf_type;		/* Descriptor type. */
 	int		kf_fd;			/* Array index. */
 	int		kf_ref_count;		/* Reference count. */
 	int		kf_flags;		/* Flags. */
 	int		kf_pad0;		/* Round to 64 bit alignment. */
 	int64_t		kf_offset;		/* Seek location. */
 	int		kf_vnode_type;		/* Vnode type. */
 	int		kf_sock_domain;		/* Socket domain. */
 	int		kf_sock_type;		/* Socket type. */
 	int		kf_sock_protocol;	/* Socket protocol. */
-	struct sockaddr_storage kf_sa_local;	/* Socket address. */
-	struct sockaddr_storage	kf_sa_peer;	/* Peer address. */
 	union {
 		struct {
+			/* Socket address. */
+			struct sockaddr_storage kf_sa_local;
+			/* Peer address. */
+			struct sockaddr_storage	kf_sa_peer;
 			/* Address of so_pcb. */
 			uint64_t	kf_sock_pcb;
 			/* Address of inp_ppcb. */
 			uint64_t	kf_sock_inpcb;
 			/* Address of unp_conn. */
 			uint64_t	kf_sock_unpconn;
 			/* Send buffer state. */
 			uint16_t	kf_sock_snd_sb_state;
 			/* Receive buffer state. */
 			uint16_t	kf_sock_rcv_sb_state;
 			/* Round to 64 bit alignment. */
 			uint32_t	kf_sock_pad0;
 		} kf_sock;
 		struct {
+			/* Space for future use */
+			uint64_t	kf_spareint64[30];
+			/* Vnode filesystem id. */
+			uint64_t	kf_file_fsid;
+			/* File device. */
+			uint64_t	kf_file_rdev;
 			/* Global file id. */
 			uint64_t	kf_file_fileid;
 			/* File size. */
 			uint64_t	kf_file_size;
-			/* Vnode filesystem id. */
-			uint32_t	kf_file_fsid;
-			/* File device. */
-			uint32_t	kf_file_rdev;
-			/* File mode. */
-			uint16_t	kf_file_mode;
+			/* Vnode filesystem id, FreeBSD 10 compat. */
+			uint32_t	kf_file_fsid_freebsd10;
+			/* File device, FreeBSD 10 compat. */
+			uint32_t	kf_file_rdev_freebsd10;
+			/* File mode, FreeBSD 10 compat. */
+			uint16_t	kf_file_mode_freebsd10;
 			/* Round to 64 bit alignment. */
 			uint16_t	kf_file_pad0;
-			uint32_t	kf_file_pad1;
+			/* File mode. */
+			uint32_t	kf_file_mode;
 		} kf_file;
 		struct {
+			/* Space for future use */
+			uint64_t	kf_spareint64[32];
 			uint32_t	kf_sem_value;
-			uint16_t	kf_sem_mode;
+			uint16_t	kf_sem_mode_freebsd10;
+			uint16_t	kf_sem_pad0;
+			uint32_t	kf_sem_mode;
 		} kf_sem;
 		struct {
+			/* Space for future use */
+			uint64_t	kf_spareint64[32];
 			uint64_t	kf_pipe_addr;
 			uint64_t	kf_pipe_peer;
 			uint32_t	kf_pipe_buffer_cnt;
 			/* Round to 64 bit alignment. */
 			uint32_t	kf_pipe_pad0[3];
 		} kf_pipe;
 		struct {
-			uint32_t	kf_pts_dev;
+			/* Space for future use */
+			uint64_t	kf_spareint64[32];
+			uint32_t	kf_pts_dev_freebsd10;
+			uint32_t	kf_pts_pad0;
+			uint64_t	kf_pts_dev;
 			/* Round to 64 bit alignment. */
-			uint32_t	kf_pts_pad0[7];
+			uint32_t	kf_pts_pad1[4];
 		} kf_pts;
 		struct {
+			/* Space for future use */
+			uint64_t	kf_spareint64[32];
 			pid_t		kf_pid;
 		} kf_proc;
 	} kf_un;
 	uint16_t	kf_status;		/* Status flags. */
 	uint16_t	kf_pad1;		/* Round to 32 bit alignment. */
 	int		_kf_ispare0;		/* Space for more stuff. */
 	cap_rights_t	kf_cap_rights;		/* Capability rights. */
 	uint64_t	_kf_cap_spare;		/* Space for future cap_rights_t. */
 	/* Truncated before copyout in sysctl */
 	char		kf_path[PATH_MAX];	/* Path to file, if any. */
 };
 
 /*
  * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of
  * another process as a series of entries.
  */
 #define	KVME_TYPE_NONE		0
 #define	KVME_TYPE_DEFAULT	1
 #define	KVME_TYPE_VNODE		2
 #define	KVME_TYPE_SWAP		3
 #define	KVME_TYPE_DEVICE	4
 #define	KVME_TYPE_PHYS		5
 #define	KVME_TYPE_DEAD		6
 #define	KVME_TYPE_SG		7
 #define	KVME_TYPE_MGTDEVICE	8
 #define	KVME_TYPE_UNKNOWN	255
 
 #define	KVME_PROT_READ		0x00000001
 #define	KVME_PROT_WRITE		0x00000002
 #define	KVME_PROT_EXEC		0x00000004
 
 #define	KVME_FLAG_COW		0x00000001
 #define	KVME_FLAG_NEEDS_COPY	0x00000002
 #define	KVME_FLAG_NOCOREDUMP	0x00000004
 #define	KVME_FLAG_SUPER		0x00000008
 #define	KVME_FLAG_GROWS_UP	0x00000010
 #define	KVME_FLAG_GROWS_DOWN	0x00000020
 
 #if defined(__amd64__)
 #define	KINFO_OVMENTRY_SIZE	1168
 #endif
 #if defined(__i386__)
 #define	KINFO_OVMENTRY_SIZE	1128
 #endif
 
 struct kinfo_ovmentry {
 	int	 kve_structsize;		/* Size of kinfo_vmmapentry. */
 	int	 kve_type;			/* Type of map entry. */
 	void	*kve_start;			/* Starting address. */
 	void	*kve_end;			/* Finishing address. */
 	int	 kve_flags;			/* Flags on map entry. */
 	int	 kve_resident;			/* Number of resident pages. */
 	int	 kve_private_resident;		/* Number of private pages. */
 	int	 kve_protection;		/* Protection bitmask. */
 	int	 kve_ref_count;			/* VM obj ref count. */
 	int	 kve_shadow_count;		/* VM obj shadow count. */
 	char	 kve_path[PATH_MAX];		/* Path to VM obj, if any. */
 	void	*_kve_pspare[8];		/* Space for more stuff. */
 	off_t	 kve_offset;			/* Mapping offset in object */
 	uint64_t kve_fileid;			/* inode number if vnode */
-	dev_t	 kve_fsid;			/* dev_t of vnode location */
+	uint32_t kve_fsid;			/* dev_t of vnode location */
 	int	 _kve_ispare[3];		/* Space for more stuff. */
 };
 
 #if defined(__amd64__) || defined(__i386__)
 #define	KINFO_VMENTRY_SIZE	1160
 #endif
 
 struct kinfo_vmentry {
 	int	 kve_structsize;		/* Variable size of record. */
 	int	 kve_type;			/* Type of map entry. */
 	uint64_t kve_start;			/* Starting address. */
 	uint64_t kve_end;			/* Finishing address. */
 	uint64_t kve_offset;			/* Mapping offset in object */
 	uint64_t kve_vn_fileid;			/* inode number if vnode */
-	uint32_t kve_vn_fsid;			/* dev_t of vnode location */
+	uint32_t kve_vn_fsid_freebsd10;		/* dev_t of vnode location */
 	int	 kve_flags;			/* Flags on map entry. */
 	int	 kve_resident;			/* Number of resident pages. */
 	int	 kve_private_resident;		/* Number of private pages. */
 	int	 kve_protection;		/* Protection bitmask. */
 	int	 kve_ref_count;			/* VM obj ref count. */
 	int	 kve_shadow_count;		/* VM obj shadow count. */
 	int	 kve_vn_type;			/* Vnode type. */
 	uint64_t kve_vn_size;			/* File size. */
-	uint32_t kve_vn_rdev;			/* Device id if device. */
-	uint16_t kve_vn_mode;			/* File mode. */
+	uint32_t kve_vn_rdev_freebsd10;		/* Device id if device. */
+	uint16_t kve_vn_mode_freebsd10;		/* File mode. */
 	uint16_t kve_status;			/* Status flags. */
-	int	 _kve_ispare[12];		/* Space for more stuff. */
+	uint64_t kve_vn_fsid;			/* dev_t of vnode location */
+	uint64_t kve_vn_rdev;			/* Device id if device. */
+	uint32_t kve_vn_mode;			/* File mode. */
+	uint32_t _kve_ispare0;			/* Space for more stuff. */
+	int	 _kve_ispare[6];		/* Space for more stuff. */
 	/* Truncated before copyout in sysctl */
 	char	 kve_path[PATH_MAX];		/* Path to VM obj, if any. */
 };
 
 /*
  * The KERN_PROC_KSTACK sysctl allows a process to dump the kernel stacks of
  * another process as a series of entries.  Each stack is represented by a
  * series of symbol names and offsets as generated by stack_sbuf_print(9).
  */
 #define	KKST_MAXLEN	1024
 
 #define	KKST_STATE_STACKOK	0		/* Stack is valid. */
 #define	KKST_STATE_SWAPPED	1		/* Stack swapped out. */
 #define	KKST_STATE_RUNNING	2		/* Stack ephemeral. */
 
 #if defined(__amd64__) || defined(__i386__)
 #define	KINFO_KSTACK_SIZE	1096
 #endif
 
 /*
  * One record per thread.  Assuming lwpid_t and int are 4 bytes, the
  * fields add up to 4 + 4 + KKST_MAXLEN + 16 * 4 = 1096 bytes, which
  * matches KINFO_KSTACK_SIZE -- TODO confirm on other architectures.
  */
 struct kinfo_kstack {
 	lwpid_t	 kkst_tid;			/* ID of thread. */
 	int	 kkst_state;			/* Validity of stack. */
 	char	 kkst_trace[KKST_MAXLEN];	/* String representing stack. */
 	int	 _kkst_ispare[16];		/* Space for more stuff. */
 };
 
 /*
  * Address range record for the signal trampoline.
  * NOTE(review): the producer is not visible here; presumably start/end
  * bound the trampoline code in the process address space -- confirm
  * whether ksigtramp_end is inclusive or exclusive.
  */
 struct kinfo_sigtramp {
 	void	*ksigtramp_start;	/* Start address. */
 	void	*ksigtramp_end;		/* End address. */
 	void	*ksigtramp_spare[4];	/* Spare pointers; unused here. */
 };
 
 #ifdef _KERNEL
 /* Flags for kern_proc_out function. */
 #define KERN_PROC_NOTHREADS	0x1
 #define KERN_PROC_MASK32	0x2
 
 struct sbuf;
 
 /*
  * The kern_proc out functions are helper functions to dump process
  * miscellaneous kinfo structures to sbuf.  The main consumers are KERN_PROC
  * sysctls but they may also be used by other kernel subsystems.
  *
  * The functions manipulate the process locking state and expect the process
  * to be locked on enter.  On return the process is unlocked.
  */
 
 int	kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen);
 int	kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen);
 int	kern_proc_out(struct proc *p, struct sbuf *sb, int flags);
 int	kern_proc_vmmap_out(struct proc *p, struct sbuf *sb);
 
 int	vntype_to_kinfo(int vtype);
 #endif /* _KERNEL */
 
 #endif